scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27630B)
      1 {
      2   "paper": {
      3     "title": "AutoVerus: Automated Proof Generation for Rust Code",
      4     "authors": [
      5       "Chenyuan Yang",
      6       "Xuheng Li",
      7       "Md Rakib Hossain Misu",
      8       "Jianan Yao",
      9       "Weidong Cui",
     10       "Yeyun Gong",
     11       "Chris Hawblitzel",
     12       "Shuvendu Lahiri",
     13       "Jacob R. Lorch",
     14       "Shuai Lu",
     15       "Fan Yang",
     16       "Ziqiao Zhou",
     17       "Shan Lu"
     18     ],
     19     "year": 2024,
     20     "venue": "Proc. ACM Program. Lang. (OOPSLA)",
     21     "arxiv_id": "2409.13082",
     22     "doi": "10.1145/3763174"
     23   },
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The Data-Availability Statement says 'The code and artifact of AutoVerus are available at microsoft/verus-proof-synthesis.' This is a GitHub repository URL."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Verus-Bench, the benchmark suite of 150 proof tasks, is part of the released artifact at microsoft/verus-proof-synthesis. The benchmark was curated from public sources (Diffy, MBPP-DFY-153, CloverBench)."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 6.2.1 specifies hardware (Ubuntu 22.04.1 LTS, 24-core CPUs, 64 GB RAM) and the default model (GPT-4o version 2024-05-13). The tool is a Python CLI using Azure OpenAI APIs. The released artifact would contain environment details, though the paper itself does not list a requirements.txt explicitly."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper provides a Data-Availability Statement pointing to the repository microsoft/verus-proof-synthesis. The tool is described as 'a command line Python tool' (Section 5), and the evaluation methodology is described in sufficient detail in Section 6 for reproduction. The released artifact at the repository provides instructions."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No confidence intervals or error bars are reported. Results are given as counts (e.g., 137/150 tasks proved) without uncertainty quantification. The paper acknowledges LLM randomness but does not report CIs across the 3 runs."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are used despite comparative claims between AutoVerus and the baseline (137 vs 67 tasks) and across model variants. The comparisons rely solely on raw count differences."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports concrete effect sizes with baseline context: AutoVerus proves 137/150 (91.3%) vs baseline 67/150 (44.7%). Per-phase breakdowns are given (78, 22, 37 tasks). Cost is $37 total. Time comparisons show AutoVerus needs 22 seconds per task vs 458 seconds for baseline to reach the same number of proved tasks."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The benchmark size of 150 tasks is not formally justified. Section 6.1 describes how tasks were selected from existing benchmarks but does not justify whether 150 is sufficient for the claims being made. The ablation study uses a 30-task subset without power analysis."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper states results are reported 'after running AutoVerus for three times' (Section 6.2.3) and mentions 122 tasks proved on the first attempt, 10 on the second, 5 on the third. However, no standard deviation or variance across these three runs is reported. The 30-task ablation with different models also reports single aggregate numbers."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "A baseline of repeatedly invoking GPT-4o with a comprehensive prompt is included (Section 6.2.2). The baseline was deliberately made competitive: same temperature, same multi-output setting, and even given an unfair advantage of including answers to 4 tasks as examples."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The baseline is a custom repeated-prompting scheme, not a comparison against published state-of-the-art tools. While related work mentions Loopy, Laurel, DafnyBench, and others, none are directly compared experimentally. The paper argues these tools target different languages/tasks, but no head-to-head comparison with any existing LLM-for-proof system is conducted."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Extensive ablation studies are presented in Section 7.4: Phase-1/2 ablations (Table 4) test few-shot example removal, phase fusion, and ranking disabling. Phase-3 ablation compares specialized repair agents vs a generic debugging agent. LLM choice and temperature ablations are in Table 5."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Multiple metrics are reported: number of correctly verified tasks, time cost (seconds), number of LLM calls, per-phase breakdown of proved tasks, repair agent success rates, token consumption, and monetary cost ($37)."
     94       },
     95       "human_evaluation": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Human evaluation is not applicable here. Verus formal verification provides a mathematically rigorous correctness check -- proofs either pass or fail verification. This is a stronger guarantee than human evaluation."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.1 explicitly states the team designing AutoVerus did not overlap with the team setting up the majority of Verus-Bench (all MBPP and CloverBench tasks). The few-shot examples used in AutoVerus agents have no overlap with Verus-Bench. The 3 Misc tasks with potential leakage from Verus Tutorial are explicitly discussed in Section 8."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 2 breaks down results by benchmark source (CloverBench, Diffy, MBPP, Misc) and by phase. Table 3 provides per-agent statistics. Figure 11 breaks down repair agent invocations by LLM model. Table 4 and 5 provide detailed ablation breakdowns."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 7.5 ('Failed cases') categorizes the 13 failed tasks into three types: arithmetic overflow difficulties (6 tasks), insufficient knowledge of vstd library (5 tasks), and complicated logic (2 tasks). Specific task examples (MBPP-170, Misc-deduplicate, MBPP-755) are discussed with detailed explanations."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that the fused Phase-1/2 design performs worse than separate agents (7 fewer tasks proved, Table 4). The generic debugging agent proves only 17/37 tasks vs AutoVerus's specialized agents. GPT-3.5-turbo proves only 18/30 tasks. Lower temperatures reduce performance."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 'more than 90% of them' proved -- Table 2 shows 137/150 = 91.3%. The abstract claims 'more than half of them tackled in less than 30 seconds or 3 LLM calls' -- Section 7.2 and Figure 8 support this. All abstract claims are backed by specific results."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims are made through ablation studies: removing components (few-shot examples, ranking, individual phases) and measuring the effect on proved tasks. These are controlled single-variable manipulations. The ablation design in Tables 4 and 5 provides adequate evidence for 'component X contributes to performance' claims."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 8 explicitly discusses three key limitations for scaling: code dependencies in large systems, specification complexity, and more diverse proof problems. The paper states 'it is still just a collection of 150 tasks. It cannot represent many real-world proof tasks in large system code.' The title specifies 'Rust Code' and the work is clearly scoped to Verus."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 8 discusses data leakage as an alternative explanation for good performance (LLMs may have seen proof tasks in training data) and provides evidence against it: the baseline LLM cannot prove the 3 potentially leaked tasks, and AutoVerus's proof differs from the online tutorial version. LLM randomness is acknowledged."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 6.2.1 specifies 'GPT-4o (model version 2024-05-13).' Section 8 also mentions 'earlier GPT-4o models (with 2024-02-15 and 2024-05-13 versions).' GPT-4-turbo, GPT-3.5-turbo, and Deepseek-R1-32B are also named. The GPT-4o version with specific date qualifies."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper describes prompt content in natural language (e.g., 'we ask it to add loop invariants to the given Rust code' in Section 4.1.1, repair strategy instructions in Sections 4.2 and 4.3) but does not provide the actual prompt text used. The few-shot examples are described ('three toy Verus examples') but not shown in full. The released code may contain prompts, but the paper itself does not."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Temperature is set to 1.0 (Section 6.2.2, with ablation at 0.1, 0.4, 0.7 in Table 5). Multi-output is set to 5 for Phase 1 and repair agents (3 otherwise). Maximum debugging iterations is 10 (Section 4.3.2). Threshold for complex assert-repair agent is 3 attempts. These are clearly reported."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The agentic scaffolding is described in extensive detail: three-phase workflow (Generation, Refinement, Debugging) in Section 4 with Algorithms 1-3, agent composition, error selection priority, repair acceptance criteria, Lynette for safety checking and merging, Houdini algorithm integration. The agent network is shown in Figure 2 and repair agent list in Figure 6."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6.1 documents the benchmark construction: 153 MBPP-DFY problems -> 67 trivial removed, 8 untranslatable -> 78 included. CloverBench: 60 -> 39 translated -> 3 incompatible, 4 weak, 21 trivial -> 11 included. Diffy: 69 -> 31 non-linear removed -> 38 included. 23 Misc from tutorials. Filtering criteria and counts are explicit."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8 is titled 'Discussion & Threats to Validity' and provides substantive discussion of general threats, data leakage, and scaling challenges."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 8 discusses specific threats: the benchmark is only 150 tasks and cannot represent real-world system code; authors who designed AutoVerus were familiar with some Diffy and Misc tasks; three specific tasks may have data leakage from the Verus Tutorial; evaluation does not cover newest Rust features like iterators."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 8 ('How to scale up?') explicitly states three things the results do NOT show: handling complex code dependencies in large systems, generating specifications (only proof annotations), and handling the diversity of proof problems in real verified systems. The paper also states 'a single Rust function is the unit of proof generation' (Section 4 footnote)."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The Verus-Bench benchmark suite and AutoVerus code are released at microsoft/verus-proof-synthesis. Since Verus verification is deterministic (given the proof, verification either passes or fails), the raw data can be independently verified."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 6.1 describes in detail how each benchmark source was collected and translated: MBPP-DFY-153 from Dafny, CloverBench from Dafny/Verus, Diffy from C, and Misc from Verus tutorials/libraries. Selection criteria and exclusion reasons are documented."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants are involved. The benchmark is constructed from existing public benchmark suites."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline from source benchmarks to Verus-Bench is documented: translation from Dafny/C to Rust/Verus, filtering of trivial tasks (88 rejected), filtering of untranslatable tasks (8 rejected), manual inspection of CloverBench translations. Section 6.1 provides counts at each stage."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding statement is provided. The Acknowledgment section only mentions 'anonymous reviewers.' Given that 8 of 13 authors are affiliated with Microsoft Research and the tool uses Azure OpenAI APIs, funding should be disclosed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: 8 authors are at Microsoft Research (including Microsoft Research Asia), and others are at various universities. The Microsoft connection is transparent."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Microsoft Research employees evaluate a tool built using Microsoft's Azure OpenAI APIs, for Microsoft's Verus verification tool. Microsoft has a commercial interest in demonstrating the value of its AI and verification tools. The funder is not independent of the outcome."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement is present in the paper. Given Microsoft's commercial interests in AI tools and Verus, a declaration should be included."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 8 discusses data leakage and mentions 'earlier GPT-4o models (with 2024-02-15 and 2024-05-13 versions), predating the widespread public availability of CloverBench.' While not stating an exact training cutoff date, the paper reasons about the temporal relationship between model versions and benchmark availability."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 8 ('Data leakage') extensively discusses potential train/test overlap. It analyzes which Verus-Bench sources were publicly available before model training: Diffy and MBPP proofs were never public, CloverBench was released after the GPT-4o training cutoff, and only 3 Misc tasks had potential leakage from the Verus Tutorial."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 8 provides a two-pronged argument: (1) the scarcity of Verus resources online (fewer than 10 GitHub projects) means training data is minimal, and (2) the baseline GPT-4o cannot prove the 3 potentially leaked tasks without AutoVerus, suggesting LLMs did not memorize them. AutoVerus's proof differs from the tutorial version."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 7.1 reports: 'This entire process consumed 3 million input and 1.5 million output tokens. Based on the Azure GPT-4o pricing... the total cost for the benchmark suite was approximately $37.' Average 8.8 LLM calls per task. Time costs are shown in Figure 8a (per-task time budgets)."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Total cost of $37 for the benchmark suite is stated. Hardware is specified (Ubuntu 22.04.1, 24-core CPUs, 64 GB RAM). Average time per task is reported (70.0 seconds for GPT-4o, 229.8 seconds for GPT-4-turbo). Token consumption is quantified."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "AutoVerus can automatically generate correct proof for more than 90% of the 150 Verus-Bench tasks.",
    301       "evidence": "Table 2 shows 137 out of 150 tasks (91.3%) proved. Results are from running AutoVerus up to 3 times per task (Section 7.1).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "AutoVerus proves roughly twice as many tasks as a competitive baseline under the same time and LLM-call budget.",
    306       "evidence": "Table 2 shows 137 vs 67 tasks. Figure 8 shows AutoVerus consistently proves about 2x more tasks at every budget level. The baseline had 10 minutes per task and included answer examples for 4 tasks (Section 7.2).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "With a tight budget of 30 seconds or 3 LLM calls, AutoVerus can prove more than half of the Verus-Bench tasks.",
    311       "evidence": "Section 7.2 states: 'if we give each task only 30 seconds, AutoVerus can generate correct proof for 81 tasks.' Figure 8 confirms this for both time and LLM-call budgets.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "The AutoVerus framework works well with different LLM models (GPT-4o, GPT-4-turbo, Deepseek-R1).",
    316       "evidence": "Table 5 shows GPT-4o and GPT-4-turbo both prove 26/30 tasks, Deepseek-R1 proves 23/30. However, GPT-4-turbo takes 229.8s vs 70.0s per task (Section 7.4.3).",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "Each phase of AutoVerus contributes meaningfully to the overall performance.",
    321       "evidence": "Table 2: Phase-1 proves 78, Phase-2 adds 22, Phase-3 adds 37. Ablation in Table 4 shows fusion of phases hurts (7 fewer tasks). Section 7.4.2 shows generic debugging agent proves only 17/37 tasks vs specialized agents.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "Merging proof candidates from multiple LLM outputs improves proof success.",
    326       "evidence": "Section 7.3.1: 'For 14 tasks, although none of the direct output of the Phase-1 agent is correct, merging some of them immediately provides correct proof.' Example in Figure 5.",
    327       "supported": "strong"
    328     }
    329   ],
    330   "methodology_tags": [
    331     "benchmark-eval"
    332   ],
    333   "key_findings": "AutoVerus, a multi-agent LLM system for automated proof generation in Verus (a Rust verification tool), successfully proves 137 of 150 benchmark tasks (91.3%), compared to 67 (44.7%) by a competitive baseline of repeatedly prompting GPT-4o. The system achieves this through a three-phase approach: preliminary proof generation, generic refinement, and error-driven debugging, each using specialized LLM agents. The tool is efficient, proving over half the tasks in under 30 seconds at a total cost of $37 for the full benchmark suite, and the framework generalizes across different LLM models.",
    334   "red_flags": [
    335     {
    336       "flag": "Corporate self-evaluation",
    337       "detail": "8 of 13 authors are Microsoft Research employees. AutoVerus uses Azure OpenAI APIs and targets Verus, which was developed by Microsoft Research. The paper essentially evaluates Microsoft tools using Microsoft infrastructure. No competing interests statement is provided."
    338     },
    339     {
    340       "flag": "Weak baseline design",
    341       "detail": "The sole baseline is a custom repeated-prompting scheme rather than any existing published tool for LLM-based proof generation (e.g., Loopy, DafnyBench). While the paper argues these target different languages, the absence of any head-to-head comparison with published systems makes it difficult to contextualize the results."
    342     },
    343     {
    344       "flag": "No variance across runs",
    345       "detail": "Despite acknowledging LLM randomness, the paper reports results from 3 runs without reporting variance or standard deviation. The claim of '137 tasks' appears to be a maximum across runs rather than an average with uncertainty bounds."
    346     },
    347     {
    348       "flag": "Partial benchmark designer overlap",
    349       "detail": "Section 8 acknowledges that 'the authors who participated in the design of AutoVerus are already familiar with some of the tasks in the Diffy benchmark and in the Misc set,' creating a potential for unintentional tuning to these tasks."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "Towards AI-Assisted Synthesis of Verified Dafny Methods",
    355       "authors": ["Md Rakib Hossain Misu", "Cristina V. Lopes", "Iris Ma", "James Noble"],
    356       "year": 2024,
    357       "doi": "10.1145/3643763",
    358       "relevance": "Evaluates LLM capability for generating verified code in Dafny, directly relevant to GenAI for formal verification."
    359     },
    360     {
    361       "title": "DafnyBench: A Benchmark for Formal Software Verification",
    362       "authors": ["Chloe Loughridge", "Qinyi Sun", "Seth Ahrenbach"],
    363       "year": 2024,
    364       "arxiv_id": "2406.08467",
    365       "relevance": "Creates a benchmark for LLM-based proof synthesis in Dafny, comparable evaluation methodology to AutoVerus."
    366     },
    367     {
    368       "title": "Clover: Closed-Loop Verifiable Code Generation",
    369       "authors": ["Chuyue Sun", "Ying Sheng", "Oded Padon", "Clark W. Barrett"],
    370       "year": 2024,
    371       "relevance": "LLM-based system for generating verified Dafny programs from natural language, one of the benchmark sources for Verus-Bench."
    372     },
    373     {
    374       "title": "LeanDojo: Theorem Proving with Retrieval-Augmented Language Models",
    375       "authors": ["Kaiyu Yang", "Aidan M. Swope", "Alex Gu"],
    376       "year": 2023,
    377       "relevance": "Uses LLMs with retrieval augmentation for theorem proving in LEAN, relevant to AI-assisted formal verification."
    378     },
    379     {
    380       "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    381       "authors": ["Emily First", "Markus N. Rabe", "Talia Ringer", "Yuriy Brun"],
    382       "year": 2023,
    383       "doi": "10.1145/3611643.3616243",
    384       "relevance": "LLM-based proof generation and repair for Isabelle, directly comparable approach to AutoVerus for a different verification framework."
    385     },
    386     {
    387       "title": "Finding Inductive Loop Invariants using Large Language Models",
    388       "authors": ["Adharsh Kamath", "Aditya Senthilnathan", "Saikat Chakraborty"],
    389       "year": 2023,
    390       "arxiv_id": "2311.07948",
    391       "relevance": "Uses LLMs and Houdini algorithm for loop invariant generation in C programs (Loopy), shares core techniques with AutoVerus."
    392     },
    393     {
    394       "title": "Lemur: Integrating Large Language Models in Automated Program Verification",
    395       "authors": ["Haoze Wu", "Clark W. Barrett", "Nina Narodytska"],
    396       "year": 2024,
    397       "relevance": "Integrates LLMs with automated reasoning for program verification, similar high-level approach to combining AI with formal methods."
    398     },
    399     {
    400       "title": "Evaluating large language models trained on code",
    401       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    402       "year": 2021,
    403       "arxiv_id": "2107.03374",
    404       "relevance": "Foundational evaluation of LLMs for code generation (Codex/HumanEval), relevant to the broader LLM code generation evaluation landscape."
    405     },
    406     {
    407       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    408       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    409       "year": 2023,
    410       "relevance": "Questions reliability of LLM-generated code, motivating the need for formal verification of AI-generated code."
    411     },
    412     {
    413       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    414       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    415       "year": 2023,
    416       "arxiv_id": "2304.00385",
    417       "relevance": "Evaluates LLM-based automated program repair with cost analysis, relevant to AI-assisted software engineering evaluation methodology."
    418     },
    419     {
    420       "title": "Towards Neural Synthesis for SMT-Assisted Proof-Oriented Programming",
    421       "authors": ["Saikat Chakraborty", "Gabriel Ebner", "Siddharth Bhat"],
    422       "year": 2024,
    423       "arxiv_id": "2405.01787",
    424       "relevance": "Uses LLMs for proof generation in F*, another proof-oriented language, directly relevant to GenAI for formal verification."
    425     },
    426     {
    427       "title": "Laurel: Generating Dafny Assertions Using Large Language Models",
    428       "authors": ["Eric Mugnier", "Emmanuel Anaya Gonzalez", "Ranjit Jhala"],
    429       "year": 2024,
    430       "arxiv_id": "2405.16792",
    431       "relevance": "LLM-based assertion generation for Dafny verification, related approach to AutoVerus's assertion repair agents."
    432     }
    433   ]
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs