scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32063B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Formal Verification of LLM-Generated Code from Natural Language Prompts",
      6     "authors": [
      7       "Aaron Councilman",
      8       "David Fu",
      9       "Aryan Gupta",
     10       "Chengxiao Wang",
     11       "David Grove"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2507.13290",
     16     "doi": "10.48550/arXiv.2507.13290"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 83% verification of correct code and 92% identification of incorrect code, matching the results in Section 6.3 (82.9% and 92.4% respectively).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper's causal claims are primarily about the verification system's design choices. Claims about why errors occur (e.g., knowledge base returning single values, unsupported features) are justified by detailed analysis of each failure case in Section 6.3.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper is clear that Astrogator is specific to Ansible (Section 7.2: 'While Astrogator is specific to Ansible'), and future work sections (Bash, Arduino) are presented as illustrative examples with hand-translations, not claimed as fully supported.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for the observed verification rates. For example, it does not consider whether the 83%/92% rates are artifacts of the benchmark suite's simplicity or the particular LLMs chosen.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper is careful to distinguish between its test-based ground truth (which is imperfect) and true correctness, explicitly acknowledging in Section 6.1 that tests are used because 'no other solution was feasible' and that they 'have done our best to craft strong tests.'",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 (Discussion) covers engineering effort limitations and generality limitations. Section 6.3 extensively discusses verification accuracy limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper discusses specific threats: the benchmark suite is hand-crafted and small (21 tasks), the knowledge base returns only one correct value causing false rejections, test-based ground truth may have errors, and the formal query language coverage is limited.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states scope boundaries: focused on Ansible DSL, does not support shell commands, knowledge base is hand-curated, Bash and Arduino examples are hand-translated only. Section 7.1 acknowledges significant engineering effort required.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section discloses funding from IBM-ILLINOIS Discovery Accelerator Institute (IIDAI) and NSF (Delta computing resource, ACCESS program).",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: five authors from UIUC and one from IBM Research. David Grove is from IBM Research, and the work is funded by IBM-ILLINOIS partnership.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "IBM and NSF fund the research but do not have a direct financial stake in the specific verification results. The paper does not evaluate IBM products.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is provided. Given IBM Research co-authorship and IBM-ILLINOIS funding, a competing interests declaration would be expected.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 3 formally defines 'correctness' using set-theoretic notation (user intent U, formal specification language S, semantics function). 'State Calculus', 'Formal Query Language', and 'Astrogator' are all precisely defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 enumerates four explicit contributions: (1) formalizing NL programming with correctness guarantees, (2) proposing formal query languages, (3) presenting Astrogator for Ansible, (4) evaluating on a 21-task benchmark.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 8 covers six categories of related work (Controlled Natural Languages, test-based verification, proof assistant generation, NL summaries, LLM formalization, program synthesis) and explains how Astrogator differs from each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. Astrogator's implementation is described but not released.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The benchmark suite of 21 tasks is described (NL descriptions and formal queries in Appendix A) but no downloadable dataset or archive of the 1260 generated programs is provided.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions using VMs (Debian 12.11.0, Ubuntu 24.04.2, RHEL 9.6) for testing but provides no environment specification for running Astrogator itself (no requirements.txt, Dockerfile, or dependency list).",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No reproduction instructions, README, or scripts are provided. The paper describes the methodology but does not provide step-by-step instructions for reproducing experiments.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Results are reported as raw counts and percentages (e.g., 82.9% acceptance of correct programs, 92.4% rejection of incorrect programs) with no confidence intervals or error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims GPT-4o 'performs significantly better' (51.4% vs 21.5%) but provides no statistical test to support this comparative claim.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Results are reported with baseline context: 277/334 correct programs accepted (82.9%), 856/926 incorrect programs rejected (92.4%), and per-model breakdowns with absolute counts out of 210 programs each (Table 3).",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The benchmark suite has 21 tasks with 10 generations per LLM per task (1260 total). No justification is given for why 21 benchmarks or 10 samples per task is sufficient.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance or standard deviation is reported. LLMs are nondeterministic and prompted 10 times each, but no spread measures are provided across runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "There is no comparison to any alternative verification method, testing approach, or prior verification tool. Astrogator is evaluated in isolation.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "No baselines are included at all, so contemporaneity cannot be assessed.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Astrogator has multiple components (formal query language, State Calculus, symbolic interpreter, unifier, knowledge base) but no ablation study is performed to measure individual contributions.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports both acceptance rate of correct programs (82.9%) and rejection rate of incorrect programs (92.4%), effectively measuring both precision and recall of the verifier.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation is conducted. The formal query language is designed for user review but no user study validates that users can actually understand and verify formal queries.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is not a machine learning system being trained; it is a formal verification tool. There is no training/test split to discuss.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 4 provides per-benchmark breakdowns of correct/incorrect programs accepted/rejected, and Table 3 provides per-model breakdowns of error types.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 6.3 extensively discusses both false rejections (57 correct programs rejected) and false acceptances (70 incorrect programs accepted), categorizing root causes including unsupported features, knowledge base limitations, and under-specified queries.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper honestly reports that only 26.5% of LLM-generated code is correct, that Astrogator rejects 57 correct programs and accepts 70 incorrect ones, and details the specific limitations causing these failures.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are listed by name (Deepseek Coder 6.7b, Granite 8b Code, Qwen2.5 Coder 3b, Llama 3.1 8b, Starcoder 2 15b, GPT-4o) but no specific version strings or snapshot dates are given for GPT-4o.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The full code-generation prompt is provided in Appendix B.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or other generation hyperparameters are reported for any of the 6 LLMs used.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The LLMs are prompted directly with a single prompt to generate Ansible playbooks.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 6.2 describes post-processing of generated code: extracting key elements and inserting into a template, with counts of failures (42 syntax errors, 5 non-terminating programs).",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The 1260 generated programs, test scripts, VM configurations, and Astrogator source code are not publicly available.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 6.1 describes benchmark construction from StackOverflow, Ansible Forum, Ansible Galaxy, and interesting variations. Section 6.2 describes code generation: 6 LLMs, 10 programs each, 21 benchmarks.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. The 'data' is LLM-generated code and a hand-crafted benchmark suite.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is documented: LLM generation → post-processing (42 syntax failures, 5 non-terminating) → VM testing (Debian, Ubuntu, RHEL) → correctness classification → verifier evaluation. Counts at each stage are provided in Section 6.2.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the 6 LLMs used to generate code.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The benchmark tasks are derived from StackOverflow and Ansible Forum posts, which are likely in the training data of the LLMs. This potential overlap is not discussed.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The benchmark is custom-built from public sources (StackOverflow, Ansible Forum, Ansible Galaxy) but no contamination analysis is performed. Since these sources are likely in LLM training data, the 26.5% correctness rate may be influenced by contamination.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section 6.3 reports: 'Astrogator took about 70 seconds to process the 1260 programs,' giving wall-clock time for the verification pipeline.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "While verification time is mentioned (70 seconds for 1260 programs), the total compute budget for LLM code generation across 6 models and 1260 programs is not stated.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "LLMs are prompted 10 times per task to get diverse outputs, but no seed sensitivity analysis or variance across generations is reported.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": true,
    380           "justification": "Section 6.2: 'we prompt each for ten programs for each problem to further increase the diversity of our program set, resulting in a total of 1,260 programs.'",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": false,
    385           "answer": false,
    386           "justification": "No hyperparameter tuning is performed for the verification system. The LLMs are used off-the-shelf.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": false,
    391           "answer": false,
    392           "justification": "No configuration selection is performed; the verifier has a fixed design without tunable parameters.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": false,
    397           "answer": false,
    398           "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors built both the verifier and the benchmark suite, including the formal queries and reference solutions. This self-evaluation bias is not acknowledged or discussed.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": false,
    409           "answer": false,
    410           "justification": "The verifier is deterministic and not compute-variable. LLM comparisons are across model size, not compute budget.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "Section 6.1 discusses benchmark design: tasks are 'common tasks based on top posts on StackOverflow and the Ansible Forum' with average 1.67 tasks per reference solution, comparable to prior work [37]. The limitations of test-based ground truth are explicitly discussed.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is used. LLMs are prompted directly without agents or tool use.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "The benchmark is derived from StackOverflow and Ansible Forum posts that predate the LLMs' training. No temporal analysis is discussed.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the prompt format or task descriptions leak information about expected solutions.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether benchmark tasks are similar to training examples. StackOverflow-derived tasks are likely well-represented in training data.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is used.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Astrogator verifies correct LLM-generated Ansible code in 82.9% of cases",
    457       "evidence": "Of 334 correct programs identified by VM test execution, Astrogator accepted 277 (Section 6.3, Table 4)",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Astrogator rejects incorrect LLM-generated Ansible code in 92.4% of cases",
    462       "evidence": "Of 926 incorrect programs, Astrogator rejected 856 (Section 6.3, Table 4)",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "All false rejections are due to implementation limitations, not fundamental approach limitations",
    467       "evidence": "Section 6.3 categorizes 57 false rejections as: 18 unsupported features (easily fixable), 39 knowledge-base single-value limitation (fixable but technical). Authors assert all are addressable.",
    468       "supported": "weak"
    469     },
    470     {
    471       "claim": "GPT-4o significantly outperforms open-source models on Ansible generation (51.4% vs ~21.5%)",
    472       "evidence": "Table 3: GPT-4o generated 108/210 correct programs vs. 15-55/210 for open-source models",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "The State Calculus approach generalizes to Bash and Arduino beyond Ansible",
    477       "evidence": "Section 7.2 shows hand-translated examples verified by Astrogator, but no automated compiler or full system is implemented for either language",
    478       "supported": "weak"
    479     },
    480     {
    481       "claim": "Formal Query Language is understandable to non-expert users without requiring specialized system knowledge",
    482       "evidence": "The design is motivated by principles (A1, A2) and examples (Table 2), but no user study validates that real users can correctly review queries",
    483       "supported": "weak"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "case-study"
    489   ],
    490   "key_findings": "Astrogator, a formal verification system for Ansible code generated by LLMs, achieves 82.9% true positive rate (correctly identifying correct programs) and 92.4% true negative rate (correctly rejecting incorrect programs) on a 21-task benchmark. All 57 false rejections are attributable to specific fixable implementation limitations (unsupported Ansible features and knowledge base constraints). Of 70 false acceptances, 68 are expected results from under-specified queries that require human review of program assumptions, not verifier errors. GPT-4o produces substantially more correct Ansible code than open-source models (51.4% vs ~21.5%).",
    491   "red_flags": [
    492     {
    493       "flag": "Tiny benchmark (21 tasks)",
    494       "detail": "The entire evaluation rests on 21 hand-crafted tasks. This is far too small to support generalizable performance estimates for a system claiming applicability to 'mission-critical' software."
    495     },
    496     {
    497       "flag": "No baseline comparison",
    498       "detail": "Astrogator is evaluated in isolation with no comparison to any existing approach (test-based checking, static analysis, other verifiers). It is impossible to assess relative merit."
    499     },
    500     {
    501       "flag": "IBM conflict of interest",
    502       "detail": "IBM Research co-authors the paper and co-funds it through IIDAI. IBM owns Red Hat, which develops Ansible — the evaluated technology. No competing interests statement is provided."
    503     },
    504     {
    505       "flag": "Benchmark contamination unaddressed",
    506       "detail": "Benchmark tasks were drawn from StackOverflow and Ansible Forum, which are almost certainly in training data for all evaluated LLMs. No discussion of whether LLM performance is inflated by memorization."
    507     },
    508     {
    509       "flag": "Formal query usability not empirically validated",
    510       "detail": "A core claim is that non-expert users can review formal queries for correctness, but no user study is conducted to validate this assumption."
    511     },
    512     {
    513       "flag": "No statistical rigor",
    514       "detail": "No confidence intervals, significance tests, or variance measures are reported for any result. With only 21 tasks and 60 samples per task, results could vary substantially under different sampling."
    515     },
    516     {
    517       "flag": "Code not released",
    518       "detail": "Astrogator's implementation is not publicly available, making independent reproduction or extension impossible."
    519     }
    520   ],
    521   "cited_papers": [
    522     {
    523       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    524       "relevance": "Foundational work on LLM code generation that establishes test-based evaluation as the standard; directly cited as context for why tests are insufficient for formal guarantees"
    525     },
    526     {
    527       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    528       "relevance": "Documents user difficulty in verifying LLM-generated code, motivating the need for formal verification assistance"
    529     },
    530     {
    531       "title": "Do Users Write More Insecure Code with AI Assistants?",
    532       "relevance": "Evidence that AI coding assistants can produce insecure code, strengthening the motivation for formal verification in critical systems"
    533     },
    534     {
    535       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    536       "relevance": "Survey evidence that bugs in LLM code are often subtle, directly cited in motivation section"
    537     },
    538     {
    539       "title": "Automated Code Generation for Information Technology Tasks in YAML through Large Language Models",
    540       "relevance": "Prior work on LLM-generated Ansible code used as comparison point for benchmark design and code generation methodology"
    541     },
    542     {
    543       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    544       "relevance": "Evidence that LLMs struggle with practical programming tasks involving large codebases, cited in motivation"
    545     },
    546     {
    547       "title": "Can Large Language Models Transform Natural Language Intent into Formal Method Postconditions?",
    548       "relevance": "Directly related prior work on LLM-based formalization of natural language intent, compared in related work section"
    549     },
    550     {
    551       "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    552       "relevance": "Related work on LLM-generated formal proofs in proof assistants, discussed as alternative approach to verified code generation"
    553     },
    554     {
    555       "title": "A Survey and Classification of Controlled Natural Languages",
    556       "relevance": "Survey of CNLs that directly underpins the design of the Ansible Formal Query Language"
    557     },
    558     {
    559       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    560       "relevance": "One of the six LLMs evaluated in the benchmark experiments"
    561     }
    562   ],
    563   "engagement_factors": {
    564     "practical_relevance": {
    565       "score": 2,
    566       "justification": "Addresses a real pain point (verifying LLM-generated infrastructure code) but requires significant engineering effort and is limited to Ansible."
    567     },
    568     "surprise_contrarian": {
    569       "score": 1,
    570       "justification": "The approach of using formal query languages as a bridge is novel but not counterintuitive; the finding that all failures are 'fixable' is self-serving."
    571     },
    572     "fear_safety": {
    573       "score": 2,
    574       "justification": "Explicitly targets safety-critical and mission-critical systems, citing risks of LLM code errors in network stacks and embedded systems controllers."
    575     },
    576     "drama_conflict": {
    577       "score": 0,
    578       "justification": "No controversy or conflict angle; the paper is a straightforward systems contribution."
    579     },
    580     "demo_ability": {
    581       "score": 1,
    582       "justification": "Astrogator exists as a working system but code is not released, so readers cannot try it themselves."
    583     },
    584     "brand_recognition": {
    585       "score": 1,
    586       "justification": "IBM Research co-authorship and UIUC affiliation provide moderate recognition, but no high-profile lab or widely-known product association."
    587     }
    588   },
    589   "hn_data": {
    590     "threads": [
    591       {
    592         "hn_id": "44268286",
    593         "title": "Geometry from Quantum Temporal Correlations",
    594         "points": 60,
    595         "comments": 27,
    596         "url": "https://news.ycombinator.com/item?id=44268286",
    597         "created_at": "2025-06-13T13:21:47Z"
    598       },
    599       {
    600         "hn_id": "43847316",
    601         "title": "EDGS: Eliminating Densification for Efficient Convergence of 3DGS",
    602         "points": 2,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=43847316",
    605         "created_at": "2025-04-30T16:16:24Z"
    606       },
    607       {
    608         "hn_id": "43844343",
    609         "title": "Let Me Grok for You: Accelerating Grokking via Embedding Transfer",
    610         "points": 2,
    611         "comments": 0,
    612         "url": "https://news.ycombinator.com/item?id=43844343",
    613         "created_at": "2025-04-30T12:38:42Z"
    614       },
    615       {
    616         "hn_id": "36321227",
    617         "title": "Correct Compilation of Semiring Contractions (2022)",
    618         "points": 2,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=36321227",
    621         "created_at": "2023-06-14T03:53:22Z"
    622       },
    623       {
    624         "hn_id": "27997501",
    625         "title": "So you want to analyze Scheme programs with Datalog?",
    626         "points": 2,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=27997501",
    629         "created_at": "2021-07-29T15:13:22Z"
    630       },
    631       {
    632         "hn_id": "43182283",
    633         "title": "Demonstrating specification gaming in reasoning models",
    634         "points": 1,
    635         "comments": 1,
    636         "url": "https://news.ycombinator.com/item?id=43182283",
    637         "created_at": "2025-02-26T09:49:49Z"
    638       },
    639       {
    640         "hn_id": "44465492",
    641         "title": "Few-Shot Learning for Industrial Time Series: Screw-Fastening Process Monitoring",
    642         "points": 1,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=44465492",
    645         "created_at": "2025-07-04T15:41:35Z"
    646       },
    647       {
    648         "hn_id": "43852518",
    649         "title": "TSP Accelerator Powered by SOT-MRAMs and Hierarchical Clustering",
    650         "points": 1,
    651         "comments": 0,
    652         "url": "https://news.ycombinator.com/item?id=43852518",
    653         "created_at": "2025-05-01T00:56:16Z"
    654       }
    655     ],
    656     "top_points": 60,
    657     "total_points": 71,
    658     "total_comments": 28
    659   }
    660 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs