scan.json (30099B)
1 { 2 "paper": { 3 "title": "(Security) Assertions by Large Language Models", 4 "authors": [ 5 "Rahul Kande", 6 "Hammond Pearce", 7 "Benjamin Tan", 8 "Brendan Dolan-Gavitt", 9 "Shailja Thakur", 10 "Ramesh Karri", 11 "Jeyavijayan Rajendran" 12 ], 13 "year": 2023, 14 "venue": "IEEE Transactions on Information Forensics and Security", 15 "arxiv_id": "2306.14027", 16 "doi": "10.1109/TIFS.2024.3372809" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "Section I.E lists 'Open-sourcing LLM-based framework and benchmarks' as a contribution, but no repository URL, GitHub link, or archive link appears anywhere in the paper text. A stated intention without a concrete link does not satisfy this criterion." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The benchmark suite is described in detail (Table I, Listings 1-5) and benchmarks derive from public sources (Hack@DAC, OpenTitan), but no download link or archive for the complete benchmark suite is provided in the paper." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "Section IV-B mentions '32-core, 2.6 GHz Intel Xeon with 512 GB of RAM running CentOS Linux release 7.9.2009' and 'Siemens Modelsim' as the simulator, but no software dependencies, library versions, Python version, or environment specification file is provided." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are included. The framework pipeline is described conceptually (Figure 1, Section III) but there are no runnable commands or a README-style guide." 
41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Results in Table III and Figures 3-13 report only point estimates (percentages). No confidence intervals, error bars, or uncertainty measures are provided for any accuracy numbers." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper makes numerous comparative claims (e.g., 'DetailedEx, DetailedCom performed the best with 72.16% accuracy') based solely on comparing numbers without any statistical significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper reports raw accuracy percentages but does not compute formal effect sizes. Comparisons like '93.55% vs. 26.54% average' are given without standardized effect size measures." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The benchmark suite contains 10 benchmarks. No justification is given for why 10 benchmarks are sufficient, and no power analysis or discussion of whether this sample size supports the generality of claims." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Sampling is stochastic at both evaluated temperatures (0.4 and 0.9, with top_p=1), since only temperature 0 yields deterministic decoding, yet no variance, standard deviation, or spread measures are reported across repeated queries. n=10 assertions per query are generated but only aggregate percentages are reported." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table V compares code-davinci-002 against three other LLMs: code-cushman-001, codegen-2b-ft, and ChatGPT (Jan 9 2023 version). Additionally, the systematic variation of prompt components serves as an internal comparison."
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The LLMs compared (Codex code-davinci-002, code-cushman-001, codegen-2b-ft, ChatGPT Jan 2023) were all contemporary at the time of experimentation (2022-2023). These represented the state of the art for code generation." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "The paper systematically varies individual prompt components: example assertions (4 types), comment strings (3 types), design source codes (3 types), assertion beginnings (3 types), synonyms (2 types), temperature (2 values), and frequency penalty (3 values). Sections IV-D.1 through IV-D.7 analyze the contribution of each component." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table III reports four metrics: number of assertions generated, % generated assertions compiled, % compiled assertions simulated, and % correct simulated assertions. These capture different stages of the evaluation pipeline." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is automated through simulation against golden reference assertions. No human expert review of generated assertion quality, correctness, or security relevance is performed. The golden references are manually crafted beforehand but the evaluation itself is entirely automated." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "The best prompt configurations (Table IV) are identified and reported on the same 10 benchmarks used for all evaluations. There is no separation between a validation set for prompt engineering and a held-out test set for final reporting." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table III provides per-benchmark breakdowns for all 10 benchmarks (BM1-BM10). 
Figures 4-8 and 9-13 show performance broken down by prompt component type across all benchmarks." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section IV-F provides detailed analysis of incorrect assertions with specific error types categorized in Listing 9: invalid Verilog syntax, invalid variables, incorrect variable indices, additional logic after assertion, incorrect logic, unwanted logic, and incorrect timing. Multiple concrete examples are shown." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports that NoEx configurations produce <2% correct assertions, ChatGPT tends to explain rather than generate assertions (Table V), BM7 achieves only 4.46% correct simulated assertions, and 595 prompt configurations result in less than 5% accuracy. Multiple failure modes are documented." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims the authors investigate LLMs for hardware assertion generation, design an evaluation framework, and create a benchmark suite. All three are substantiated in the paper: Table III shows LLM performance, Section III describes the framework, and Table I describes the benchmarks." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper makes causal claims about prompt components affecting performance (e.g., 'DetailedCom combinations performed the best because they elaborate all the details'). The systematic factorial design (2,268 configurations forming a full grid over the prompt components and sampling parameters, so each component can be compared with the others held fixed) provides adequate support for these component-level causal claims, functioning as a controlled ablation."
127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title '(Security) Assertions by Large Language Models' and conclusions reference 'LLMs' generally, but the main evaluation is on a single LLM (code-davinci-002) with only limited scalability testing on three others (Table V). The benchmark has only 10 designs covering a subset of CWE types. The paper does not explicitly bound its generalization claims to these specific models and benchmarks." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper describes patterns in results (e.g., why DetailedCom works better) but does not discuss alternative explanations such as whether the LLM's success could be due to memorizing public Hack@DAC/OpenTitan code rather than genuine assertion generation capability." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section V.B explicitly acknowledges: 'it is possible a generated assertion could be a better representation of the security intent but is marked as incorrect by the simple criteria of mismatch with hand-crafted references.' They also discuss the parameterized signal width simplification in testbenches. The gap between proxy (matching golden reference) and actual outcome (security verification quality) is acknowledged." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model versions are provided: 'code-davinci-002', 'code-cushman-001', 'codegen-2b-ft', and 'ChatGPT (Jan 9 2023 version)' (Section IV-A and IV-E)." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Full prompt structures are provided in Listings 2 and 5. Listing 2 shows the prompt data file with all comment strings, example assertions, and beginning strings. 
Listing 5 shows a complete prompt string with all components. The template structure (Figure 2) is also detailed." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section IV-B reports: max tokens=256, stop token='endmodule', top P=1, presence penalty=0, n=10. Temperature varied as {0.4, 0.9} and frequency penalty as {0, 0.5, 1}." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The LLM is queried directly with prompts and returns completions without any multi-step agent workflow, tool use, or feedback loops." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section III-C and Table II document the assertion file generation process including four automated syntax fixes (R1-R4): removing non-ASCII characters, removing characters after 'endmodule', removing triple-quoted strings, and adding missing 'endmodule'. Section III-A documents how benchmark designs were trimmed to <100 lines." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section V 'Discussion and Limitations' contains four substantive subsections: V-A Completeness of Benchmark Suite, V-B Simulation Testbench, V-C Use-cases of our Framework, and V-D Future Work." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section V-A discusses specific threats: benchmarks representing only a subset of CWE types, comment strings based on human judgment of complexity, reference assertions being 'only one of several possible ways to capture the desired security property.' Section V-B discusses signal width parameterization assumptions and simulation timeout handling." 
181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section V-C states: 'Our work focused on concurrent assertions in separate SVA files, keeping the design files unmodified. Generating assertions like immediate assertions that are written into the hardware designs could be future work.' Section V-A bounds the CWE coverage. Section V-C acknowledges that the framework 'as-is is not usable without some amount of security expertise.'" 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "Only aggregate statistics (percentages, distributions) are reported in tables and figures. The individual 226,800 generated assertions and their evaluation results are not available for independent verification." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section III-A describes the benchmark suite construction in detail: two manually crafted designs and eight from Hack@DAC and OpenTitan (Table I). Section IV-B describes the experimental setup including hardware, LLM configuration, and the systematic prompt generation process." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. The data sources are standard public benchmarks (Hack@DAC competition designs, OpenTitan open-source SoC) plus two manually crafted designs." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "Figure 1 shows the complete pipeline: benchmark suite → prompt generation → LLM query → assertion file generator (with syntax fixing) → simulator → scoreboard. Each stage is described in Sections III-A through III-E with concrete details." 
208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Section VII: 'Our research work was partially funded by the US Office of Naval Research (ONR Award #N00014-18-1-2058). This research work is also supported in part by a gift from Intel Corporation.'" 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are listed: Texas A&M University, University of New South Wales, University of Calgary, New York University. None of the authors are affiliated with OpenAI (whose product is evaluated)." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": true, 224 "justification": "ONR (military research funder) and Intel (hardware company) fund the work. Neither is OpenAI. The paper explicitly states: 'This work does not in any way constitute an Intel endorsement of a product or supplier.' The funders do not have a direct stake in whether OpenAI's Codex succeeds at generating assertions." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper. The Intel gift is disclosed in acknowledgments but there is no explicit declaration of whether authors hold patents, equity, or other financial interests related to the findings." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "The paper states Codex was trained on 'all the open-source code on GitHub' (Section II-B) but does not state a specific training data cutoff date for any of the evaluated models." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "The benchmark designs come from Hack@DAC (public competition) and OpenTitan (open-source SoC), both publicly available on GitHub. 
Since Codex was trained on GitHub code, there is substantial risk these designs were in the training data. This overlap is never discussed." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "Hack@DAC designs and OpenTitan source code were publicly available before Codex's training. The paper does not address whether the LLM may have seen these specific designs or similar assertions during training, which could inflate accuracy results." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. The evaluation is entirely automated using LLM-generated assertions compared against golden references." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. The study evaluates LLM outputs on hardware benchmarks." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "The paper issues 22,680 queries to Codex (2,268 per benchmark × 10 benchmarks), each returning n=10 completions for 226,800 generated assertions in total, plus additional calls for three other LLMs, but reports no API costs, token consumption, or wall-clock time for inference."
291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Section IV-B mentions the hardware (32-core, 2.6 GHz Intel Xeon, 512 GB RAM) but does not state total compute time, simulation hours, or API expenditure for the 22,680+ queries (226,800+ generated assertions)." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "Generation is stochastic at both temperature settings (0.4 and 0.9, with top_p=1), since only temperature 0 yields deterministic decoding, but no seed sensitivity analysis or reporting of result variation across seeds is provided." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section IV-B clearly states: 'n is set to generate 10 assertions for every query' and '2,268 prompts for each benchmark,' totaling 22,680 assertions per benchmark and 226,800 total." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": true, 312 "justification": "The full grid of configurations is explicitly stated: temperature {0.4, 0.9} × frequency penalty {0, 0.5, 1} = 6 parameter combinations per prompt string. All 2,268 configurations are reported, making the search budget fully transparent." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Table IV shows the best configurations ranked by accuracy across all benchmarks. Figure 3 shows the full distribution of all 2,268 configurations. All results are shown, not just the best, making selection transparent." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper makes numerous comparative claims across 2,268 configurations and 10 benchmarks without any statistical tests, let alone corrections for multiple comparisons."
323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors created both the benchmark suite (golden reference assertions, comment strings, prompt templates) and the evaluation framework. They do not acknowledge potential bias from designing both the test and evaluation criteria." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "Table V compares four LLMs of vastly different sizes (code-davinci-002 vs. codegen-2b-ft) without discussing compute differences. No performance-per-compute analysis is provided." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": true, 337 "justification": "Section V discusses validity extensively: Section V-A acknowledges benchmark may not cover all CWE types, V-B discusses testbench limitations (parameterized signal widths), and V-C acknowledges that 'it is possible a generated assertion could be a better representation of the security intent but is marked as incorrect.' The gap between their evaluation metric and the actual goal is explicitly discussed." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is used. LLMs are queried directly with prompts for single-shot completions." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "Hack@DAC designs (2019-2021) and OpenTitan code were publicly available before Codex's training. The paper does not discuss whether the LLM may have seen these exact designs or similar assertions during training." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The GoldenDUT prompt configuration includes the correct design source code, which the LLM may have seen paired with assertions during training on GitHub. 
This potential feature leakage from training data is not discussed." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The 10 benchmarks include 6 from related Hack@DAC competitions and 2 from the same OpenTitan SoC. Structural similarities between benchmarks from the same source are not discussed, nor is potential non-independence between the training data and test benchmarks." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, decontamination, or temporal analysis to check if benchmark code was in the training data." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "LLMs can generate correct hardware security assertions for all 10 benchmarks, with an average accuracy of 26.54% across all prompt configurations and up to 93.55% for the best configuration.", 371 "evidence": "Table III shows correct assertions generated for all 10 benchmarks. Table IV shows the best configuration achieving 93.55%. Section IV-C reports 26.54% average accuracy.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Detailed comment strings and example assertions are the most important prompt components for generating correct assertions.", 376 "evidence": "Section IV-D.1 and Figure 4/11 show (DetailedEx, DetailedCom) achieves 72.16% accuracy. 
Section IV-D.7 identifies example assertions and comment strings as the dominant factors, with the top 3 combinations producing ~70% of correct assertions.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "Design source code type has minimal impact when detailed comments are provided.", 381 "evidence": "Section IV-D.2 and Figure 12 show GoldenDUT (48.79%), BuggyDUT (45.59%), and EmptyDUT (48.83%) all achieve similar accuracy with DetailedCom.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Temperature 0.9 produces higher accuracy but fewer total correct assertions compared to 0.4.", 386 "evidence": "Section IV-D.5 and Figure 13 show (temp=0.9, freq_penalty=1) at 34.25% accuracy vs. (temp=0.4, freq_penalty=1) at 25.32%. Section IV-D.5 notes temp=0.4 generated ~10% more correct assertions in total.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "code-davinci-002 outperforms other LLMs, generating correct assertions for 9/10 benchmarks vs. 7 (code-cushman-001), 4 (codegen-2b-ft), and 5 (ChatGPT).", 391 "evidence": "Table V shows per-benchmark success/failure for all four LLMs on the most contextual prompt configuration.", 392 "supported": "weak" 393 } 394 ], 395 "methodology_tags": ["benchmark-eval"], 396 "key_findings": "OpenAI's Codex (code-davinci-002) can generate correct hardware security assertions with up to 93.55% accuracy when given detailed prompts, but averages only 26.54% across all 2,268 prompt configurations and 10 benchmarks. The quality of prompt engineering — particularly detailed comment strings and example assertions — is the dominant factor, while design source code type and assertion beginnings have relatively minor impact. 
The framework was validated across 10 benchmarks covering multiple CWE categories, and error analysis revealed common LLM failures including invalid Verilog syntax, incorrect signal references, and timing errors.", 397 "red_flags": [ 398 { 399 "flag": "Training data contamination unaddressed", 400 "detail": "Eight of the ten benchmarks derive from publicly available sources (Hack@DAC, OpenTitan) that were likely in Codex's training data (trained on 'all open-source code on GitHub'). The LLM may be recalling or partially memorizing known assertion patterns rather than generating novel ones. This is never discussed." 401 }, 402 { 403 "flag": "Small benchmark suite", 404 "detail": "Only 10 benchmarks: 2 manually crafted 'toy' designs, 6 from related Hack@DAC competitions, and 2 from OpenTitan. This limited diversity may not support generalizations about LLM capability for hardware security assertions broadly." 405 }, 406 { 407 "flag": "No statistical rigor in comparisons", 408 "detail": "Numerous comparative claims are made across 2,268 configurations without any statistical significance tests, confidence intervals, or error bars. Differences between configurations may not be statistically meaningful." 409 }, 410 { 411 "flag": "Scalability evaluation limited", 412 "detail": "The comparison with other LLMs (Table V) uses only a single prompt configuration, with binary correct/incorrect per benchmark. This provides very limited evidence for the claim about relative LLM performance." 413 }, 414 { 415 "flag": "Signal width simplification", 416 "detail": "Section V-B acknowledges that 32-bit signals were parameterized to 2-bit for simulation feasibility, assuming assertion behavior holds across data widths. This assumption is not validated." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Evaluating Large Language Models Trained on Code", 422 "authors": ["M.
Chen"], 423 "year": 2021, 424 "arxiv_id": "2107.03374", 425 "relevance": "Foundational Codex evaluation paper; the primary LLM evaluated in this work." 426 }, 427 { 428 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 429 "authors": ["H. Pearce"], 430 "year": 2022, 431 "relevance": "Directly relevant study assessing security implications of LLM-generated code (Copilot)." 432 }, 433 { 434 "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models", 435 "authors": ["H. Pearce"], 436 "year": 2022, 437 "arxiv_id": "2112.02125", 438 "relevance": "Evaluates LLMs for security vulnerability repair, closely related to LLM-based security verification." 439 }, 440 { 441 "title": "DAVE: Deriving Automatically Verilog from English", 442 "authors": ["H. Pearce", "B. Tan", "R. Karri"], 443 "year": 2020, 444 "relevance": "Pioneering work on fine-tuning LLMs for hardware description language generation from natural language." 445 }, 446 { 447 "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation", 448 "authors": ["S. Thakur"], 449 "year": 2023, 450 "relevance": "Benchmark study for LLM-based hardware code generation, directly related evaluation methodology." 451 }, 452 { 453 "title": "LLM4SecHW: Leveraging Domain-Specific Large Language Models for Hardware Debugging", 454 "authors": ["W. Fu"], 455 "year": 2023, 456 "relevance": "Uses trained LLMs for hardware security bug detection and fixing, a follow-on application of LLMs in hardware security." 457 }, 458 { 459 "title": "A Conversational Paradigm for Program Synthesis", 460 "authors": ["E. Nijkamp"], 461 "year": 2022, 462 "arxiv_id": "2203.13474", 463 "relevance": "CodeGen model used as one of the baseline LLMs evaluated in this work." 464 }, 465 { 466 "title": "From RTL to SVA: LLM-assisted generation of Formal Verification Testbenches", 467 "authors": ["M. Orenes-Vera", "M. Martonosi", "D. 
Wentzlaff"], 468 "year": 2023, 469 "arxiv_id": "2309.09437", 470 "relevance": "Parallel work evaluating LLMs for hardware assertion generation using formal verification rather than simulation." 471 }, 472 { 473 "title": "LLM4DV: Using Large Language Models for Hardware Test Stimuli Generation", 474 "authors": ["Z. Zhang"], 475 "year": 2023, 476 "arxiv_id": "2310.04535", 477 "relevance": "Evaluates LLMs for hardware test generation, complementary to assertion generation in verification workflow." 478 }, 479 { 480 "title": "Can OpenAI Codex and Other Large Language Models Help Us Fix Security Bugs?", 481 "authors": ["H. Pearce"], 482 "year": 2022, 483 "arxiv_id": "2112.02125", 484 "relevance": "Earlier preprint title of 'Examining Zero-Shot Vulnerability Repair with Large Language Models' (same arXiv ID 2112.02125, also listed in this array); evaluates LLM capability for security bug fixing, directly relevant to LLM-assisted security verification." 485 } 486 ] 487 }