scan.json (29032B)
1 { 2 "paper": { 3 "title": "FVEval: Understanding Language Model Capabilities in Formal Verification of Digital Hardware", 4 "authors": [ 5 "Minwoo Kang", 6 "Mingjie Liu", 7 "Ghaith Bany Hamad", 8 "Syed Suhaib", 9 "Haoxing Ren" 10 ], 11 "year": 2024, 12 "venue": "Design, Automation and Test in Europe (DATE)", 13 "arxiv_id": "2410.23299", 14 "doi": "10.23919/DATE64628.2025.10992720" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The abstract states: 'Our benchmark and evaluation code is available at https://github.com/NVlabs/FVEval.' A working URL is provided." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The benchmark test instances (NL2SVA-Human, NL2SVA-Machine, Design2SVA) are released as part of the GitHub repository referenced in the abstract." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper mentions using vLLM for open-source model inference but provides no requirements.txt, Dockerfile, or detailed environment specification listing library versions." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No step-by-step reproduction instructions are provided in the paper. Furthermore, full reproduction requires commercial FV tools (Cadence Jasper) for functional correctness evaluation, which are not freely available." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results in Tables 1, 2, 3, 4, 5 report point estimates only. No confidence intervals, error bars, or ± notation is used." 44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "No statistical significance tests are performed. Model comparisons are based solely on comparing raw accuracy numbers without any hypothesis testing." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": false, 53 "justification": "Results are reported as raw accuracy values (e.g., 0.456, 0.253). The paper mentions 'an improvement of up to 11%' once (Section 4.2) but does not systematically report effect sizes with baseline context across comparisons." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "No justification for sample sizes: NL2SVA-Human has 79 assertions, NL2SVA-Machine has 300 test cases, Design2SVA has 96 per category. No power analysis or rationale for these numbers." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "No variance or standard deviation is reported. Greedy decoding results are single-run. Pass@k results use the unbiased estimator from Chen et al. but no spread measures are reported." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Multiple models are compared against each other: GPT-4o, Gemini-1.5-pro/flash, Mixtral-8x22b, Llama-3/3.1 8B and 70B variants, providing mutual baselines." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "Models evaluated include gpt-4o (May 2024), Gemini-1.5-pro/flash, Llama 3.1, all contemporary state-of-the-art models at the time of evaluation." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper systematically varies evaluation settings: zero-shot vs 3-shot prompting (Tables 1 vs 3), greedy decoding vs nucleus sampling, and pass@1 vs pass@3 vs pass@5 (Tables 2, 4, 5), showing the effect of each factor." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "Four evaluation metrics are used: syntax correctness, functional correctness (exact equivalence), partial functional correctness (implication relationship), and BLEU score." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "Evaluation is entirely automated using commercial FV tools for syntax and formal equivalence checking. No systematic human evaluation of model outputs is performed, though qualitative failure examples are discussed." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": true, 95 "justification": "The benchmark test instances are separate from any training or ICL data. The 3 hand-crafted in-context examples for NL2SVA-Machine (Figure 15) are fixed and distinct from the 300 test cases." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down by sub-benchmark (NL2SVA-Human, NL2SVA-Machine, Design2SVA) and within Design2SVA by design category (Pipeline vs FSM) in Table 5." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Detailed failure case analysis is provided in Figures 7 (NL2SVA-Human), 8 (NL2SVA-Machine), and 9 (Design2SVA), with discussion of common failure modes including SVA syntax hallucination and incorrect temporal logic formulation." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Several negative findings: BLEU scores do not correlate with functional correctness (Figure 6, correlation 0.056 and 0.093); in-context examples can serve as 'distractors' for smaller models (Section 4.3, Figure 8); functional correctness remains low even for best models." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims FVEval is 'the first comprehensive benchmark' for LLM evaluation in FV (supported by the literature review in Section 5) and that 'a wide range of existing LLMs' are evaluated (8 models across proprietary and open-source, confirmed in Section 4.1)." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper makes limited causal claims. The claim that in-context examples 'can aid models' (Section 4.3) is supported by controlled comparisons (same model, same test set, zero-shot vs 3-shot). The claim that sampling improves accuracy is supported by pass@1 vs pass@k comparisons." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": true, 127 "justification": "Claims are bounded to SystemVerilog assertions and hardware formal verification. The title specifies 'Formal Verification of Digital Hardware.' Section 6 explicitly notes limitations to current design types and evaluation methods." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for observed performance differences between models. No consideration of whether training data composition, architecture differences, or other confounds could explain the results." 133 }, 134 "proxy_outcome_distinction": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper measures SVA syntax correctness and formal functional equivalence, which directly correspond to the claimed capability (generating correct formal assertions). Section 3 explicitly describes what each sub-benchmark measures and why. The proxy closely matches the claim." 138 } 139 }, 140 "setup_transparency": { 141 "model_versions_specified": { 142 "applies": true, 143 "answer": true, 144 "justification": "Section 4.1 specifies exact versions: 'gpt-4o-0513', 'gemini-1.5-pro-001', 'gemini-1.5-flash-001', 'mixtral-8x22b-instruct-v1', and specific Llama model sizes (8B, 70B) with family versions (3 and 3.1)." 145 }, 146 "prompts_provided": { 147 "applies": true, 148 "answer": true, 149 "justification": "Full prompt text is provided in the appendix: Figure 12 (NL2SVA-Human), Figure 14 (NL2SVA-Machine), Figure 15 (ICL examples), and Figure 21 (Design2SVA), including system and user messages." 150 }, 151 "hyperparameters_reported": { 152 "applies": true, 153 "answer": true, 154 "justification": "Hyperparameters are reported: greedy decoding for main experiments, T=1.0 for NL description generation, T=0.0 for critic assessment (Section 3.3), nucleus sampling with top_p=0.95 and t=0.8 for pass@k experiments (Section 4.4)." 155 }, 156 "scaffolding_described": { 157 "applies": false, 158 "answer": false, 159 "justification": "No agentic scaffolding is used. All evaluations use direct prompting of LLMs without tool use, retry logic, or feedback mechanisms." 160 }, 161 "data_preprocessing_documented": { 162 "applies": true, 163 "answer": true, 164 "justification": "Data generation processes are well-documented: NL2SVA-Machine's 4-step pipeline (random SVA generation → LLM NL description → LLM critic assessment → human inspection, Section 3.3), Design2SVA's parameterized RTL generation with control parameters (Section 3.4, Figure 4)." 165 } 166 }, 167 "limitations_and_scope": { 168 "limitations_section_present": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 6 is titled 'Limitations and Future Work' and provides substantive discussion of the benchmark's current constraints." 172 }, 173 "threats_to_validity_specific": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 6 identifies specific limitations: only arithmetic pipelines and FSMs as synthetic designs, no exploration of prompting schemes or structural inference methods, no tool-feedback or symbolic reasoning integration. These are specific to this study's design choices." 177 }, 178 "scope_boundaries_stated": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6 explicitly states what was not tested: different styles of design modules, advanced prompting techniques (citing [47]), structural inference methods (citing [51, 50]), and LLM-agentic frameworks with tool feedback." 182 } 183 }, 184 "data_integrity": { 185 "raw_data_available": { 186 "applies": true, 187 "answer": true, 188 "justification": "The benchmark data (test instances and evaluation code) is released at https://github.com/NVlabs/FVEval, enabling independent verification of the benchmark composition." 189 }, 190 "data_collection_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "NL2SVA-Human: expert-written testbenches covering FIFOs, arbiters, counters, RAM, FSMs (Table 6, 79 assertions). NL2SVA-Machine: 4-step automated generation (Section 3.3). Design2SVA: parameterized synthetic RTL with control parameters (Section 3.4, Figure 4)." 194 }, 195 "recruitment_methods_described": { 196 "applies": false, 197 "answer": false, 198 "justification": "No human participants are studied. The benchmark data consists of expert-written verification collateral and synthetically generated test cases, not human subject data." 199 }, 200 "data_pipeline_documented": { 201 "applies": true, 202 "answer": true, 203 "justification": "The NL2SVA-Machine pipeline documents each step: (1) random SVA generation, (2) LLM NL description generation with T=1.0, (3) LLM critic assessment with T=0.0, (4) human inspection (Section 3.3). Design2SVA documents control parameters and sweep methodology (Section 3.4)." 204 } 205 }, 206 "conflicts_of_interest": { 207 "funding_disclosed": { 208 "applies": true, 209 "answer": false, 210 "justification": "No funding sources (grants, corporate sponsors) are disclosed. The Acknowledgements thank individuals and Cadence Design Systems but do not mention specific funding." 211 }, 212 "affiliations_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Author affiliations are clearly stated: UC Berkeley (first author) and NVIDIA (four co-authors). The first author's internship at NVIDIA is noted." 216 }, 217 "funder_independent_of_outcome": { 218 "applies": true, 219 "answer": false, 220 "justification": "Four of five authors are from NVIDIA, a chip design company with direct commercial interest in LLM-driven FV automation. While the paper is a benchmark rather than a product evaluation, NVIDIA benefits from advancing this research area." 221 }, 222 "financial_interests_declared": { 223 "applies": true, 224 "answer": false, 225 "justification": "No competing interests or financial interests statement is included in the paper." 226 } 227 }, 228 "contamination": { 229 "training_cutoff_stated": { 230 "applies": true, 231 "answer": false, 232 "justification": "The paper states proprietary models were accessed in May 2024 (Section 4.1) but does not state the training data cutoff dates for any of the evaluated models." 233 }, 234 "train_test_overlap_discussed": { 235 "applies": true, 236 "answer": false, 237 "justification": "No discussion of whether NL2SVA-Human expert-written testbenches or their underlying design patterns could appear in LLM training data. The synthetic nature of NL2SVA-Machine and Design2SVA mitigates this concern but is not discussed." 238 }, 239 "benchmark_contamination_addressed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No contamination analysis is performed. While synthetic test case generation (NL2SVA-Machine, Design2SVA) inherently reduces contamination risk, the paper does not frame this as a contamination mitigation strategy or address contamination for the expert-written NL2SVA-Human subset." 243 } 244 }, 245 "human_studies": { 246 "pre_registered": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are studied. The paper evaluates LLMs on benchmark tasks." 250 }, 251 "irb_or_ethics_approval": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are studied." 255 }, 256 "demographics_reported": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are studied." 260 }, 261 "inclusion_exclusion_criteria": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are studied." 265 }, 266 "randomization_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants or experimental conditions requiring randomization." 270 }, 271 "blinding_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants or conditions requiring blinding." 275 }, 276 "attrition_reported": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 } 281 }, 282 "cost_and_practicality": { 283 "inference_cost_reported": { 284 "applies": true, 285 "answer": false, 286 "justification": "No inference cost, latency, or tokens consumed are reported despite evaluating 8 models across three benchmarks with multiple sampling configurations." 287 }, 288 "compute_budget_stated": { 289 "applies": true, 290 "answer": false, 291 "justification": "No computational budget is stated. The paper mentions using vLLM for open-source model inference but does not report GPU hours, hardware specifications, or total API costs." 292 } 293 }, 294 "experimental_rigor": { 295 "seed_sensitivity_reported": { 296 "applies": true, 297 "answer": false, 298 "justification": "No seed sensitivity analysis is performed. Greedy decoding results are single-run deterministic. Pass@k experiments use nucleus sampling but do not report sensitivity to random seeds." 299 }, 300 "number_of_runs_stated": { 301 "applies": true, 302 "answer": false, 303 "justification": "The total number of samples generated for pass@k computation is not explicitly stated. The paper references the unbiased estimator from Chen et al. [6] but does not specify n (total samples per problem)." 304 }, 305 "hyperparameter_search_budget": { 306 "applies": true, 307 "answer": false, 308 "justification": "No hyperparameter search budget is reported. Temperature and sampling parameters appear to use standard values without systematic tuning, but no search process or budget is documented." 309 }, 310 "best_config_selection_justified": { 311 "applies": true, 312 "answer": true, 313 "justification": "The paper reports results for all evaluated configurations (zero-shot, 3-shot, greedy decoding, nucleus sampling) rather than selecting only the best. Tables 1 and 3 show zero-shot and 3-shot side by side." 314 }, 315 "multiple_comparison_correction": { 316 "applies": false, 317 "answer": false, 318 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 319 }, 320 "self_comparison_bias_addressed": { 321 "applies": true, 322 "answer": false, 323 "justification": "The authors do not acknowledge potential bias from evaluating third-party models on their own benchmark. As benchmark creators, design choices could inadvertently favor or disfavor certain model architectures." 324 }, 325 "compute_budget_vs_performance": { 326 "applies": true, 327 "answer": false, 328 "justification": "No performance-compute analysis is provided despite comparing models of vastly different sizes (8B to 70B+ parameters) and architectures (dense vs MoE)." 329 }, 330 "benchmark_construct_validity": { 331 "applies": true, 332 "answer": true, 333 "justification": "Section 3.1 explicitly discusses what capabilities each sub-benchmark measures and their relevance to industrial FV workflows. Figure 6 demonstrates that BLEU is insufficient for evaluating SVA correctness, validating the need for formal equivalence checking." 334 }, 335 "scaffold_confound_addressed": { 336 "applies": false, 337 "answer": false, 338 "justification": "No scaffolding is used. All models are evaluated via direct prompting without agentic frameworks or tool augmentation." 339 } 340 }, 341 "data_leakage": { 342 "temporal_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "No discussion of temporal leakage. The NL2SVA-Human expert-written testbenches and common design patterns (FIFOs, arbiters) may exist in model training data. Synthetic generation for NL2SVA-Machine and Design2SVA mitigates this but is not discussed as a leakage prevention measure." 346 }, 347 "feature_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether evaluation setup or prompt format could leak information about expected answers." 351 }, 352 "non_independence_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No analysis of whether test cases share structural similarities that could inflate performance estimates (e.g., FIFO variants sharing common assertion patterns)." 356 }, 357 "leakage_detection_method": { 358 "applies": true, 359 "answer": false, 360 "justification": "No concrete leakage detection or prevention method is applied. The synthetic generation approach inherently reduces contamination risk but is not framed or analyzed as such." 361 } 362 } 363 }, 364 "scan_version": 2, 365 "active_modules": ["experimental_rigor", "data_leakage"], 366 "claims": [ 367 { 368 "claim": "FVEval is the first comprehensive benchmark for evaluating LLMs on tasks pertaining to hardware formal verification via SystemVerilog assertions.", 369 "evidence": "Section 5 (Related Work) reviews prior work and notes existing evaluations are 'limited to less than ~20 cases of test instances and have considered a limited variety of task settings,' while FVEval covers 79 + 300 + 192 test instances across three sub-tasks.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Proprietary models with advanced code generation capabilities (gpt-4o, Gemini) achieve the best performance on NL2SVA-Human across all metrics.", 374 "evidence": "Table 1 shows gpt-4o achieves highest functional correctness (0.456) and partial functional correctness (0.582). Gemini-1.5-flash achieves highest syntax (0.949).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "BLEU scores do not correlate with formal functional correctness for SVA assertions.", 379 "evidence": "Figure 6 shows correlation coefficients of 0.056 (GPT-4o) and 0.093 (Llama-3.1-70B) between BLEU scores and functional correctness.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Sampling multiple response candidates improves accuracy, with state-of-the-art models achieving up to 11% improvement from pass@1 to pass@5.", 384 "evidence": "Table 2 shows gpt-4o partial functional pass@5 (0.651) vs pass@1 (0.582 from Table 1), approximately a 7 percentage point improvement. Section 4.2 claims 'up to 11%' improvement.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "In-context examples can improve functional correctness but may also serve as distractors for smaller models.", 389 "evidence": "Table 3 shows improvement for gpt-4o (0.430 to 0.467 functional) and Llama-3.1-70b (0.303 to 0.457) with 3-shot, but Llama-3.1-8b degrades (0.320 to 0.267). Figure 8 provides qualitative examples.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "LLMs can identify and generate provable SVA assertions directly from design RTL without human guidance.", 394 "evidence": "Table 5 shows pass@5 functional correctness of 0.500 (gemini-1.5-pro, pipeline) and 0.900 (gpt-4o, FSM). Figure 9 shows qualitative examples of proven assertions.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "All models achieve near-perfect pass@5 syntax correctness on Design2SVA (1.000 for all models).", 399 "evidence": "Table 5 shows Syntax@5 = 1.000 for all six models on both pipeline and FSM test categories.", 400 "supported": "strong" 401 } 402 ], 403 "methodology_tags": ["benchmark-eval"], 404 "key_findings": "FVEval is the first comprehensive benchmark for evaluating LLMs on hardware formal verification, comprising three sub-tasks (NL2SVA-Human, NL2SVA-Machine, Design2SVA) with 571 total test instances. State-of-the-art proprietary models achieve at best ~46% functional correctness on assertion generation from natural language and up to 90% pass@5 on generating provable assertions from FSM design RTL, while BLEU scores show near-zero correlation with actual functional correctness. In-context examples and multiple sampling improve performance but cannot close the gap, with smaller models sometimes degraded by fixed in-context examples.", 405 "red_flags": [ 406 { 407 "flag": "NVIDIA conflict of interest", 408 "detail": "Four of five authors are from NVIDIA, a company with direct commercial interest in LLM-driven hardware verification automation. The benchmark could shape the field's evaluation criteria in ways favorable to NVIDIA's research priorities. Additionally, Cadence Design Systems contributed to the project (Acknowledgements) while their commercial tool is required for evaluation." 409 }, 410 { 411 "flag": "Commercial tool dependency limits reproducibility", 412 "detail": "Functional correctness evaluation requires Cadence Jasper, a commercial formal verification tool. Independent researchers without access to this tool cannot fully reproduce the evaluation, limiting the benchmark's accessibility despite code release." 413 }, 414 { 415 "flag": "No contamination analysis", 416 "detail": "No analysis of whether test cases (particularly the 79 expert-written NL2SVA-Human instances based on common design patterns like FIFOs and arbiters) appear in LLM training data. Common hardware verification patterns are widely available in textbooks and online resources." 417 }, 418 { 419 "flag": "No statistical rigor in model comparisons", 420 "detail": "All model comparisons are based on raw point estimates without confidence intervals, error bars, or significance tests. Performance differences of a few percentage points are discussed as meaningful without any statistical validation." 421 }, 422 { 423 "flag": "Small benchmark for NL2SVA-Human", 424 "detail": "NL2SVA-Human contains only 79 test instances across 13 testbenches and 6 module types. This small sample size limits the reliability of performance estimates and makes it difficult to draw robust conclusions about model capabilities on expert-written specifications." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "GPT-4 Technical Report", 430 "authors": ["Josh Achiam et al."], 431 "year": 2023, 432 "arxiv_id": "2303.08774", 433 "relevance": "Foundational LLM evaluated in the benchmark; relevant to survey scope of LLM capabilities." 434 }, 435 { 436 "title": "Evaluating Large Language Models Trained on Code", 437 "authors": ["Mark Chen et al."], 438 "year": 2021, 439 "arxiv_id": "2107.03374", 440 "relevance": "Introduces HumanEval and pass@k metric used in this work; foundational code generation benchmark." 441 }, 442 { 443 "title": "ChipNeMo: Domain-Adapted LLMs for Chip Design", 444 "authors": ["Mingjie Liu et al."], 445 "year": 2023, 446 "arxiv_id": "2311.00176", 447 "relevance": "Domain-adapted LLMs for chip design including verification; directly relevant to AI-assisted code generation in specialized domains." 448 }, 449 { 450 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 451 "authors": ["Carlos E. Jimenez et al."], 452 "year": 2023, 453 "arxiv_id": "2310.06770", 454 "relevance": "Major benchmark for LLM software engineering capabilities; methodological parallel to FVEval's benchmark design approach." 455 }, 456 { 457 "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation", 458 "authors": ["Mingjie Liu et al."], 459 "year": 2023, 460 "relevance": "Closest prior benchmark for LLM hardware code generation; FVEval extends this to formal verification specifically." 461 }, 462 { 463 "title": "VeriGen: A Large Language Model for Verilog Code Generation", 464 "authors": ["Shailja Thakur et al."], 465 "year": 2023, 466 "arxiv_id": "2308.00708", 467 "relevance": "Fine-tuned LLMs for Verilog generation; relevant to AI-assisted hardware code generation." 468 }, 469 { 470 "title": "AssertLLM: Generating and Evaluating Hardware Verification Assertions from Design Specifications via Multi-LLMs", 471 "authors": ["Wenji Fang et al."], 472 "year": 2024, 473 "arxiv_id": "2402.00386", 474 "relevance": "Prior work on LLM-based formal assertion generation from specifications; direct predecessor to FVEval." 475 }, 476 { 477 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 478 "authors": ["Jason Wei et al."], 479 "year": 2022, 480 "arxiv_id": "2201.11903", 481 "relevance": "Foundational prompting technique; cited as potential improvement avenue for LLM formal verification capabilities." 482 }, 483 { 484 "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters", 485 "authors": ["Charlie Snell et al."], 486 "year": 2024, 487 "arxiv_id": "2408.03314", 488 "relevance": "Cited as evidence that inference-time compute scaling could improve LLM assertion generation, relevant to AI capability research." 489 }, 490 { 491 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 492 "authors": ["Shunyu Yao et al."], 493 "year": 2023, 494 "arxiv_id": "2305.10601", 495 "relevance": "Structural inference method cited as future improvement direction for LLM-based formal verification." 496 }, 497 { 498 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 499 "authors": ["Shunyu Yao et al."], 500 "year": 2023, 501 "relevance": "Agentic reasoning framework cited as potential approach for LLM-driven formal verification workflows." 502 }, 503 { 504 "title": "Mixtral of Experts", 505 "authors": ["Albert Q. Jiang et al."], 506 "year": 2024, 507 "arxiv_id": "2401.04088", 508 "relevance": "Sparse MoE architecture evaluated in the benchmark; relevant to understanding model architecture effects on code generation." 509 } 510 ] 511 }