scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28524B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Formal Verification of LLM-Generated Code from Natural Language Prompts",
      6     "authors": [
      7       "Aaron Councilman",
      8       "David Fu",
      9       "Aryan Gupta",
     10       "Chengxiao Wang",
     11       "David Grove"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2507.13290",
     16     "doi": "10.48550/arXiv.2507.13290"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All four key abstract claims — the FQL design, Astrogator implementation, 83% correct-code verification, and 92% incorrect-code identification — are directly supported by the evaluation in Section 6.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims 'all incorrect rejections are likely fixable' and that GPT-4o's superior performance is 'most likely because the open source models are much smaller,' neither of which is empirically tested — these are speculative causal claims on a 21-task benchmark.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The 83%/92% accuracy figures rest on only 21 benchmark tasks created by the authors; the paper sometimes claims all failures are addressable without acknowledging that a 21-task benchmark cannot support broad generalization claims about the approach.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider that the benchmark may favor tasks the system was designed to handle, or that the 83% verification rate might partly reflect easy benchmark selection rather than approach strength.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges that test-based ground truth is imperfect: 'we use tests because, even though they are imperfect, no other solution was feasible,' clearly distinguishing between test-passing and true correctness.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; failure analysis is spread across Section 6.3 and Section 7, but these are framed as 'addressable implementation issues' rather than honest limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No formal threats-to-validity analysis exists; the paper does not discuss threats such as benchmark selection bias (authors created the 21 tasks), potential contamination of LLMs on common Ansible patterns, or the implication of using imperfect test-based ground truth.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper clearly states it focuses on Ansible as a DSL, explicitly notes that Bash and Arduino support are left to future work, and acknowledges the approach does not currently handle arbitrary while-loops or shell commands.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed: IBM-ILLINOIS Discovery Accelerator Institute (IIDAI) and NSF via the Delta computing allocation (OAC 2005572 and ACCESS grants).",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All affiliations are disclosed: four UIUC authors and David Grove from IBM Research, Yorktown Heights.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "IBM both funds the research and has a co-author (David Grove, IBM Research); IBM also owns Red Hat, the company behind Ansible — the sole target language of Astrogator — creating a non-trivial conflict of interest.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is provided; there is no declaration of patents, equity, or consulting relationships beyond the funding acknowledgment.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including 'correctness,' 'formal specification,' 'Formal Query Language,' and 'State Calculus' are precisely defined with formal notation in Sections 3–5.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 lists four explicit, numbered contributions: formalizing NL-to-code correctness, proposing the FQL concept, implementing Astrogator for Ansible, and evaluating on 21 benchmarks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 8 (Related Work) engages substantively with CNLs, test-based LLM code validation, proof-assistant generation, autoformalization, and program synthesis, explaining how Astrogator differs from each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository or release URL is mentioned anywhere in the paper; the system exists but is not publicly available.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The 21-task benchmark is described in Appendix A (natural language descriptions and queries) but the 1,260 generated programs, VM test scripts, and other evaluation artifacts are not released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file, Dockerfile, or dependency specification is provided; VM OS versions are mentioned (Debian 12.11.0, Ubuntu 24.04.2, RHEL 9.6) but the Astrogator runtime environment is unspecified.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the pipeline is described conceptually but cannot be followed without the unreleased code and VM setup scripts.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper reports only point estimates (82.9%, 92.4%) with no confidence intervals or error bars around the main verification accuracy results.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims, including the GPT-4o vs. open-source model performance comparison.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Verification accuracy rates (82.9% true positive, 92.4% true negative) are reported with absolute counts and denominators, providing interpretable effect magnitudes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of 21 benchmark tasks and 10 programs per model is not justified by power analysis or any principled argument about statistical adequacy.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or run-to-run variability is reported; the 10 programs per model per task are treated as a diversity measure, not as repeated measurements for estimating variance.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No baseline verification approach is compared against Astrogator; there is no comparison to test-only validation, static analysis, or other formal methods applied to LLM-generated code.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "No baselines are included in the evaluation.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study is conducted; the contribution of the Knowledge Base, State Calculus, or FQL design individually is not evaluated.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The evaluation reports true positive rate (correct programs accepted), false negative rate (correct programs rejected), false positive rate (incorrect programs accepted), and true negative rate, along with per-model and per-task breakdowns.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Although the full Astrogator system envisions user review of queries and assumptions, the evaluation bypasses this and uses automated test-based ground truth; no human evaluators assess system outputs.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "The 21-task benchmark was created by the paper's authors and is the only evaluation set; the FQL and verifier were designed with knowledge of these task types, creating potential evaluation bias.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 4 provides per-benchmark-problem breakdowns of accepted/rejected correct and incorrect programs; Table 3 provides per-model breakdowns of error types.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 6.3 provides detailed analysis of all 57 false rejections and 70 false accepts, categorizing them by root cause with specific examples.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "False accepts and false rejects are reported with root-cause analysis; the paper acknowledges 130 programs using unsupported shell commands and multiple systematic failure modes.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Open-source model sizes are specified (e.g., 'Deepseek Coder 6.7b,' 'Llama 3.1 8b') but GPT-4o lacks a snapshot date or version identifier, making the closed-source result non-reproducible.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The full code-generation prompt is provided verbatim in Appendix B, including the instruction not to use shell commands.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or sampling parameters are reported for any of the six LLMs used in the evaluation.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Astrogator's architecture — FQL compiler, State Calculus, symbolic interpreter, and unifier — is described in detail in Sections 5.1–5.5, sufficient to understand what was tested.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Post-processing is mentioned ('post-processing to identify and extracts key elements and insert them into a template') but the specific extraction logic and template are not provided.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The 1,260 generated programs, VM test results, and Astrogator outputs are not released; only aggregate statistics are reported.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Benchmark task sources are described: top StackOverflow posts, Ansible Forum, Ansible Galaxy examples, and author-constructed variations; VM testing setup (three OS environments, snapshot resets) is described.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; programs are generated by LLMs and evaluated by automated tests.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The pipeline (generate → post-process → run on VM → test) is conceptually described but specific scripts, test implementations, and setup procedures are not released or fully documented.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training cutoff dates are stated for any of the six LLMs; GPT-4o's cutoff in particular is unspecified, and the open-source models' cutoffs are not discussed.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The benchmark is based on common Ansible tasks from StackOverflow and Ansible Forum — sources that are certainly in LLM training data — but this potential overlap is never discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The 21 benchmark tasks are based on common public Ansible programming patterns; the possibility that LLMs have memorized solutions to these exact tasks is not addressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Verification latency is reported (~70 seconds for 1,260 programs), but LLM inference cost for generating the 1,260 programs is not reported.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The NSF Delta allocation is mentioned but no GPU-hours, API costs, or total compute budget is stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Astrogator verifies correct LLM-generated Ansible code in 82.9% of cases on the 21-task benchmark.",
    375       "evidence": "277 of 334 correct programs accepted across 21 benchmark tasks, 1,260 total programs (Table 4).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Astrogator identifies incorrect LLM-generated Ansible code in 92.4% of cases.",
    380       "evidence": "856 of 926 incorrect programs rejected (Table 4); directly measured.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "GPT-4o generates correct Ansible code 51.4% of the time, significantly outperforming open-source models (~21.5%).",
    385       "evidence": "Table 3 shows GPT-4o at 108/210 correct vs. 21–55/210 for open-source models; no significance test.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "All verification false rejections are due to addressable implementation limitations rather than fundamental flaws in the approach.",
    390       "evidence": "Manual analysis of 57 false rejections attributes them to unsupported features (18) and knowledge base single-answer constraint (39); no empirical test of fixability.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "The State Calculus generalizes to Bash and Arduino programs.",
    395       "evidence": "Two hand-written translation examples (Figures 6 and 7) verified manually; no automated pipeline or benchmark for either language.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "68 of 70 false accepts are expected consequences of under-specified queries and would be resolved by user review.",
    400       "evidence": "Manual analysis of accepted incorrect programs categorizes them as assumption violations (61) or undesired additional actions (7); user review step was bypassed in evaluation.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study",
    407     "theoretical"
    408   ],
    409   "key_findings": "Astrogator, a formal verification system for LLM-generated Ansible code, achieves 82.9% true positive rate (correct programs verified) and 92.4% true negative rate (incorrect programs rejected) on an author-constructed 21-task benchmark with 1,260 programs from 6 LLMs. GPT-4o substantially outperforms open-source models (51.4% vs ~21.5% correctness). All 57 false rejections are attributed to implementation gaps (unsupported Ansible features, knowledge base rigidity) rather than theoretical limitations. The State Calculus verification approach runs in ~70 seconds for 1,260 programs, suggesting tractable performance for DSL verification.",
    410   "red_flags": [
    411     {
    412       "flag": "Tiny benchmark",
    413       "detail": "Only 21 benchmark tasks, all created by the authors based on common Ansible patterns. This is far too small to support broad accuracy claims; per-task results in Table 4 show extreme variance (some tasks 0% correct, others 100%)."
    414     },
    415     {
    416       "flag": "No baselines",
    417       "detail": "Astrogator is evaluated in isolation with no comparison to alternative verification approaches, static analysis tools, or test-only validation, making it impossible to assess relative effectiveness."
    418     },
    419     {
    420       "flag": "Author-constructed benchmark",
    421       "detail": "The same authors who built Astrogator created the 21 evaluation tasks, selecting task types the system was designed to handle. The formal query language was co-designed with the benchmark, creating circular evaluation."
    422     },
    423     {
    424       "flag": "No reproducibility artifacts",
    425       "detail": "No code, benchmark programs, VM test scripts, or environment specifications are released; the system cannot be reproduced or compared against."
    426     },
    427     {
    428       "flag": "Contamination unaddressed",
    429       "detail": "Benchmark tasks are derived from StackOverflow and Ansible Forum — sources certainly in LLM training corpora — but potential memorization of solutions is never discussed."
    430     },
    431     {
    432       "flag": "IBM conflict of interest",
    433       "detail": "IBM both funds the research and has a co-author (IBM Research); IBM also owns Red Hat (Ansible), making IBM-affiliated researchers the evaluators of a tool applied to an IBM-owned language."
    434     },
    435     {
    436       "flag": "GPT-4o version unspecified",
    437       "detail": "GPT-4o lacks a snapshot date, making the closed-source results non-reproducible and potentially stale."
    438     },
    439     {
    440       "flag": "Failures dismissed as fixable",
    441       "detail": "The conclusion that 'all errors are results of limitations in our testing setup or in the implementation that could be easily addressed' is a strong claim that overstates certainty given the limited evaluation scope."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    447       "relevance": "Foundational benchmark for LLM code generation; paper evaluates against this as a reference for LLM coding capability."
    448     },
    449     {
    450       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    451       "relevance": "Key benchmark for LLM performance on practical programming tasks; cited as evidence that LLMs struggle with complex real-world coding."
    452     },
    453     {
    454       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    455       "relevance": "Studies how users verify LLM-generated code; motivates the need for formal verification by showing users struggle to check LLM output."
    456     },
    457     {
    458       "title": "Do Users Write More Insecure Code with AI Assistants?",
    459       "relevance": "Demonstrates security risks of AI code assistants, motivating the paper's safety-critical framing."
    460     },
    461     {
    462       "title": "Can Large Language Models Transform Natural Language Intent into Formal Method Postconditions?",
    463       "relevance": "Directly related work on using LLMs to generate formal specifications; compared in Related Work section."
    464     },
    465     {
    466       "title": "Automated Code Generation for IT Tasks in YAML through Large Language Models",
    467       "relevance": "Prior work on LLM-based Ansible code generation; used as benchmark comparison for task types and playbook sizes."
    468     },
    469     {
    470       "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    471       "relevance": "Related approach using LLMs to generate proofs in proof assistants; compared as an alternative verification strategy."
    472     },
    473     {
    474       "title": "Autoformalization with Large Language Models",
    475       "relevance": "Related work on converting natural language theorem statements to formal specifications using LLMs."
    476     },
    477     {
    478       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    479       "relevance": "Contemporary benchmark for code generation; cited as evidence that LLMs struggle with complex programming tasks."
    480     },
    481     {
    482       "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation",
    483       "relevance": "Documents hallucination patterns in LLM code generation including non-existent APIs; motivates formal verification approach."
    484     }
    485   ],
    486   "engagement_factors": {
    487     "practical_relevance": {
    488       "score": 2,
    489       "justification": "Addresses a real pain point (verifying LLM-generated code for critical systems) but is currently limited to Ansible and requires significant engineering to extend."
    490     },
    491     "surprise_contrarian": {
    492       "score": 1,
    493       "justification": "The approach (using a human-readable formal query language as a bridge to verification) is novel but not contrarian; formal verification of LLM code is an expected research direction."
    494     },
    495     "fear_safety": {
    496       "score": 2,
    497       "justification": "Explicitly targets safety-critical and mission-critical applications; frames LLM code generation failures as causing 'disastrous impacts' in network stacks, distributed systems, and embedded controllers."
    498     },
    499     "drama_conflict": {
    500       "score": 0,
    501       "justification": "No controversy; this is a system paper that makes no provocative claims about existing tools or communities."
    502     },
    503     "demo_ability": {
    504       "score": 1,
    505       "justification": "The system exists but is not publicly released; readers cannot try it without access to the unreleased codebase."
    506     },
    507     "brand_recognition": {
    508       "score": 1,
    509       "justification": "UIUC and IBM Research affiliations are recognizable; evaluates GPT-4o, a well-known model, but no famous lab product is the main subject."
    510     }
    511   },
    512   "hn_data": {
    513     "threads": [
    514       {
    515         "hn_id": "44268286",
    516         "title": "Geometry from Quantum Temporal Correlations",
    517         "points": 60,
    518         "comments": 27,
    519         "url": "https://news.ycombinator.com/item?id=44268286",
    520         "created_at": "2025-06-13T13:21:47Z"
    521       },
    522       {
    523         "hn_id": "43847316",
    524         "title": "EDGS: Eliminating Densification for Efficient Convergence of 3DGS",
    525         "points": 2,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=43847316",
    528         "created_at": "2025-04-30T16:16:24Z"
    529       },
    530       {
    531         "hn_id": "43844343",
    532         "title": "Let Me Grok for You: Accelerating Grokking via Embedding Transfer",
    533         "points": 2,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=43844343",
    536         "created_at": "2025-04-30T12:38:42Z"
    537       },
    538       {
    539         "hn_id": "36321227",
    540         "title": "Correct Compilation of Semiring Contractions (2022)",
    541         "points": 2,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=36321227",
    544         "created_at": "2023-06-14T03:53:22Z"
    545       },
    546       {
    547         "hn_id": "27997501",
    548         "title": "So you want to analyze Scheme programs with Datalog?",
    549         "points": 2,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=27997501",
    552         "created_at": "2021-07-29T15:13:22Z"
    553       },
    554       {
    555         "hn_id": "43182283",
    556         "title": "Demonstrating specification gaming in reasoning models",
    557         "points": 1,
    558         "comments": 1,
    559         "url": "https://news.ycombinator.com/item?id=43182283",
    560         "created_at": "2025-02-26T09:49:49Z"
    561       },
    562       {
    563         "hn_id": "44465492",
    564         "title": "Few-Shot Learning for Industrial Time Series: Screw-Fastening Process Monitoring",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=44465492",
    568         "created_at": "2025-07-04T15:41:35Z"
    569       },
    570       {
    571         "hn_id": "43852518",
    572         "title": "TSP Accelerator Powered by SOT-MRAMs and Hierarchical Clustering",
    573         "points": 1,
    574         "comments": 0,
    575         "url": "https://news.ycombinator.com/item?id=43852518",
    576         "created_at": "2025-05-01T00:56:16Z"
    577       }
    578     ],
    579     "top_points": 60,
    580     "total_points": 71,
    581     "total_comments": 28
    582   }
    583 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs