scan-v5.json (26684B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LAAG-RV: LLM Assisted Assertion Generation for RTL Design Verification", 6 "authors": [ 7 "Karthik Maddala", 8 "Bhabesh Mali", 9 "Chandan Karfa" 10 ], 11 "year": 2024, 12 "venue": "2024 IEEE 8th International Test Conference India", 13 "arxiv_id": "2409.15281", 14 "doi": "10.1109/ITCIndia62949.2024.10651860" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims about iterative prompting enabling correct SVA generation are demonstrated through examples (Assertions 1–2 refinement). Claim about efficiency is less clearly supported—no timing baselines provided.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": false, 27 "justification": "Paper claims the 'one-time Verilog loop' improves assertion quality and reduces iterations, but no ablation study isolates its contribution. Comparison to ChIRAAG (Fig 3) is correlational, not causal.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "All evaluation is on OpenTitan designs. No discussion of applicability to other HDLs, RTL tools, or LLMs. Results may not generalize beyond this domain.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "Paper attributes failures to 'timing issues' and 'missing signals' but does not explore alternative explanations (e.g., insufficient prompt quality, domain knowledge gaps, or LLM knowledge limitations).", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "Claims 'efficient and less error-prone' but proxy measures are prompt count and simulation pass/fail. 
Correctness is well-measured; efficiency is vaguely defined relative to ChIRAAG.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "No dedicated Limitations or Threats to Validity section. Brief caveats scattered: 'LLM-generated assertions still require manual verification' and 'not guaranteed that assertions generated are enough' but no systematic analysis.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "Only boilerplate statements ('may not generalize,' 'more assertions may be required'). No specific threat analysis of sample size (n=6), training data overlap, or reproducibility.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "Domain scope (OpenTitan) is implicit but not explicitly stated. No discussion of what the work does NOT cover (e.g., other HDLs, other LLMs, different assertion styles).", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding acknowledgment or statement visible. Authors affiliated with IIT Guwahati, but no disclosure of whether work was funded or supported by industry.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All authors list IIT Guwahati affiliation. No disclosed industry involvement, consulting relationships, or affiliation with companies benefiting from the work.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funder identified, so criterion does not apply. 
If work is unfunded, this should be stated explicitly.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement, patent disclosures, or financial interest declarations included.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms defined: SVA (SystemVerilog Assertions), RTL (Register Transfer Level), ABV (Assertion Based Verification), LLM, FPV. Definitions provided in Introduction with examples.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Three contributions explicitly listed: (1) framework for LLM assertion generation, (2) manual prompting strategy, (3) evaluation on OpenTitan designs. Clear and stated upfront.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Related works section discusses LLM-driven assertion generation, prior approaches using Codex, and compares directly with ChIRAAG. Shows how the one-time Verilog loop differs from JSON-based prior work.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "No source code repository, GitHub link, or release mentioned. Custom GPT-4 environment not released. Framework not publicly available.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "Uses public OpenTitan repository (https://opentitan.org/) for all test designs. 
Standard, publicly available benchmarks used unmodified.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "Synopsys VCS 2021.09 and GPT-4 with Code-Interpreter specified, but no requirements.txt, Dockerfile, dependency list, or environmental setup instructions provided.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "Methodology described in Section III but insufficient for reproduction: actual prompts not provided, custom GPT-4 setup not documented, iterative refinement process is qualitative.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "Table I and Fig 3 show raw counts but no error bars, confidence intervals, or variance bounds. No indication of variability or spread.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "Comparison of LAAG-RV vs ChIRAAG (Table I, Fig 3) includes no statistical significance tests, p-values, or hypothesis tests.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "Only raw counts reported (assertions generated, prompts needed). No effect sizes, percentage improvements, or normalized metrics provided.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "Evaluation on n=6 OpenTitan designs. No power analysis, sample size calculation, or justification for selection of these six modules.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "No variance across multiple runs of the same design reported. 
Each design appears tested once with varying iteration counts, but aggregated variance or uncertainty not quantified.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Two baselines: (1) OpenTitan hand-written assertions compared in Table I, (2) ChIRAAG framework compared in Table I and Fig 3.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "ChIRAAG cited as [26] from 2024, contemporary with this 2024 submission. OpenTitan is current, actively maintained project.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": false, 192 "justification": "No ablation of the 'one-time Verilog loop' or domain knowledge injection. Comparison to ChIRAAG is not a controlled ablation—both methods differ in multiple ways.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Multiple metrics tracked: number of assertions generated, number of prompts required, simulation time, test case pass/fail, and assertion overlap with prior work.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": false, 204 "justification": "No human study evaluating whether generated assertions are preferred over hand-written, or whether verification engineers find the tool useful or time-saving.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": false, 210 "justification": "All six OpenTitan designs appear used for both development and evaluation. No held-out test set or cross-validation employed.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results broken down per-module (Table I): RV Timer, PattGen, GPIO, ROM_Ctrl, sram_ctrl, adc_ctrl. 
Per-design analysis provided.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Failures explicitly discussed: Assertion 1 timing error, Assertion 2 requiring three iterations, missing signal errors. Specific examples of failures and corrections shown.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Stated upfront: 'Initial observations show that some generated assertions contain issues and did not pass all the test cases.' Acknowledgment that completeness not guaranteed.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "Only 'custom GPT4 environment' specified, without version (GPT-4, GPT-4 Turbo, etc.), snapshot date, or fine-tuning details. Marketing name without technical specifics.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": false, 242 "justification": "Prompting strategy described qualitatively ('Design-Specific Prompts,' 'Error-Specific Prompts') but no actual prompt text, templates, or fill values provided.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "No temperature, top-p, max_tokens (for inference), or other sampling hyperparameters reported. 
Context window capacity mentioned but not inference settings.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "Scaffolding steps documented in Section III and Fig 1: specification input → initial prompting → one-time Verilog loop for synchronization → iterative error-driven refinement.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": false, 260 "justification": "States 'focused on basic understandable details, excluding registers, Verilog implementation' but does not specify what was extracted, filtered, or excluded in detail.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "OpenTitan designs are public, but all LLM-generated SVA, test cases, and intermediate outputs (prompts, error logs) are not released.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": false, 274 "justification": "States 'various designs from the OpenTitan repository' but no description of selection criteria, whether random or convenience sample, or why these six modules.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "N/A—no human participants in the study.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "Pipeline documented in Section III and Fig 1: design selection → specification extraction → LLM generation → simulation testing → iterative refinement. Steps clear.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Custom GPT-4 training cutoff not stated. 
Base GPT-4 cutoff not disclosed, making it unclear whether OpenTitan (public, widely known) was in training data.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "No discussion of whether OpenTitan designs or assertions appear in GPT-4's training corpus. Major risk given OpenTitan is a well-known public project.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "OpenTitan is not a traditional benchmark but is a public source artifact. No analysis of whether LLM was trained on or exposed to these specific designs.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "N/A—no human participants.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "N/A—no human participants.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "N/A—no human participants.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "N/A—no human participants.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "N/A—no human participants.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "N/A—no human participants.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "N/A—no human participants.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "No OpenAI API cost, number of API calls, 
or cost-per-design reported despite using GPT-4 multiple times per design in iterative loops.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "Total computational budget not stated. Synopsys VCS licensing, OpenAI API quota, and total compute cost not disclosed.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Custom GPT-4 with domain knowledge can generate correct SystemVerilog assertions from natural language specifications through iterative simulator-driven prompting", 373 "evidence": "Assertions 1 and 2 examples showing initial failures and correction after error feedback; Table I shows 6 designs with successfully refined assertions.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "A one-time Verilog loop for signal synchronization reduces the number of prompts required compared to JSON-based approaches", 378 "evidence": "Fig 3 comparison with ChIRAAG shows LAAG-RV requires fewer prompts on average (3.5 vs 5.7 prompts per design).", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "LLM-generated assertions can exceed coverage of hand-written assertions in both quantity and uncovered design aspects", 383 "evidence": "Table I shows LAAG-RV generating 7–14 assertions vs 0–6 in OpenTitan reference; qualitative examples (Assertions 8–10) claim to cover design state transitions not in original.", 384 "supported": "weak" 385 }, 386 { 387 "claim": "Manual error prompting with simulator logs enables LLM self-correction of assertion errors within 1–3 iterations", 388 "evidence": "Assertion 1 fixed in one iteration; Assertion 2 required three iterations; multiple examples of error identification and correction shown.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "LLM-generated assertions are functionally equivalent to hand-written assertions despite syntactic differences", 393 "evidence": "Assertions 11–12 comparison showing different SVA 
syntax (disable iff vs inline reset check) achieving same functional goal.", 394 "supported": "weak" 395 } 396 ], 397 "methodology_tags": [ 398 "case-study", 399 "empirical" 400 ], 401 "key_findings": "A custom GPT-4 environment with domain knowledge can generate SystemVerilog assertions from natural language specifications. Initial assertions frequently contain syntax and timing errors, but a feedback loop using simulator error logs enables iterative refinement to functional correctness within 1–3 prompting cycles per assertion. The one-time Verilog loop for signal synchronization reduces required prompts compared to prior JSON-based approaches. LLM-generated assertions exceed the count and coverage of hand-written OpenTitan references, though completeness is not guaranteed and manual verification remains necessary.", 402 "red_flags": [ 403 { 404 "flag": "Training data contamination risk", 405 "detail": "OpenTitan is a public, widely-known open-source project likely in GPT-4's training corpus. No analysis of potential train/test overlap; evaluating the LLM on its own training data would inflate results." 406 }, 407 { 408 "flag": "No reproducibility", 409 "detail": "Custom GPT-4 environment not released; actual prompts not provided; no code repository; custom domain knowledge injection not documented or reproducible by others." 410 }, 411 { 412 "flag": "Weak statistical rigor", 413 "detail": "n=6 designs; no confidence intervals, error bars, or significance tests; no power analysis or sample size justification; no variance quantification across runs." 414 }, 415 { 416 "flag": "Unspecified model configuration", 417 "detail": "GPT-4 version/snapshot date not stated; inference hyperparameters (temperature, top-p, max_tokens) not reported; context window capacity mentioned but not inference settings." 
418 }, 419 { 420 "flag": "No cost analysis", 421 "detail": "Framework requires repeated OpenAI API calls per design but no cost-per-design, total API cost, or ROI compared to manual assertion writing disclosed." 422 }, 423 { 424 "flag": "Missing scope boundaries", 425 "detail": "Results only shown on OpenTitan designs; no discussion of generalization to other HDLs, RTL tools, or other LLM models; domain specificity not established." 426 }, 427 { 428 "flag": "No human evaluation", 429 "detail": "No user study with verification engineers; no measurement of actual time savings, usability, or preference over hand-written assertions." 430 }, 431 { 432 "flag": "Incomplete coverage not addressed", 433 "detail": "Paper explicitly states 'not guaranteed that the assertions generated are enough to cover all the design aspects' but does not quantify coverage or discuss coverage completeness metrics." 434 }, 435 { 436 "flag": "No ablation study", 437 "detail": "One-time Verilog loop claimed as key improvement but no controlled comparison; domain knowledge injection effect not isolated." 438 }, 439 { 440 "flag": "No limitations section", 441 "detail": "No dedicated Limitations or Threats to Validity section; caveats scattered informally; no systematic threat analysis." 442 } 443 ], 444 "cited_papers": [ 445 { 446 "title": "A survey of large language models", 447 "authors": "Zhao et al.", 448 "relevance": "Foundational overview of LLM architectures and capabilities; directly relevant to methodology and model choice." 449 }, 450 { 451 "title": "Evaluating large language models trained on code", 452 "authors": "Chen et al.", 453 "relevance": "Evaluation framework for code-generating LLMs; relevant to assessing assertion generation quality." 454 }, 455 { 456 "title": "Using LLMs to facilitate formal verification of RTL", 457 "authors": "Orenes-Vera et al.", 458 "relevance": "Prior work on LLM-assisted hardware verification; directly related to this paper's domain."
459 }, 460 { 461 "title": "LLM-assisted generation of hardware assertions", 462 "authors": "Kande et al.", 463 "relevance": "Contemporaneous work on LLM assertion generation; establishes prior art." 464 }, 465 { 466 "title": "Automated generation of security assertions for RTL models", 467 "authors": "Witharana et al.", 468 "relevance": "Security-focused assertion generation; related method for formal property verification." 469 }, 470 { 471 "title": "ChIRAAG: ChatGPT informed rapid and automated assertion generation", 472 "authors": "Mali et al.", 473 "relevance": "Direct baseline comparison; uses JSON-structured prompts vs LAAG-RV's Verilog loop approach." 474 } 475 ], 476 "engagement_factors": { 477 "practical_relevance": { 478 "score": 2, 479 "justification": "RTL verification is a real production problem; tool addresses genuine pain point. However, requires expensive GPT-4 API access, manual prompting expertise, and post-generation verification—limiting practical deployment." 480 }, 481 "surprise_contrarian": { 482 "score": 1, 483 "justification": "LLMs generating code and assertions was well-established by 2024. Iterative error-driven refinement is expected behavior. No surprising findings or contrarian claims about LLM limitations." 484 }, 485 "fear_safety": { 486 "score": 1, 487 "justification": "Using LLMs in safety-critical hardware verification raises correctness concerns, but paper does not frame or explore this as a safety risk. Treats verification failures as engineering iteration rather than safety issue." 488 }, 489 "demo_ability": { 490 "score": 2, 491 "justification": "Can demonstrate LLM generating assertions and being debugged via simulator feedback. However, requires GPT-4 API access and Synopsys VCS (commercial tools), limiting public reproducibility." 492 }, 493 "brand_recognition": { 494 "score": 1, 495 "justification": "IIT Guwahati is academically respectable but not a top-tier AI research lab.
Uses OpenAI GPT-4 (recognizable) but no novel LLM contribution." 496 }, 497 "drama_conflict": { 498 "score": 1, 499 "justification": "Iterative LLM debugging is technically interesting but lacks controversy, conflict, or surprising failure modes. No dramatic claims or unsolved tensions." 500 } 501 }, 502 "hn_data": { 503 "threads": [ 504 { 505 "hn_id": "41105779", 506 "title": "Diffusion Training from Scratch on a Micro-Budget", 507 "points": 208, 508 "comments": 27, 509 "url": "https://news.ycombinator.com/item?id=41105779" 510 }, 511 { 512 "hn_id": "46140475", 513 "title": "Ragas: Automated Evaluation of Retrieval Augmented Generation", 514 "points": 4, 515 "comments": 0, 516 "url": "https://news.ycombinator.com/item?id=46140475" 517 }, 518 { 519 "hn_id": "45300655", 520 "title": "Generalizable Geometric Image Caption Synthesis", 521 "points": 3, 522 "comments": 0, 523 "url": "https://news.ycombinator.com/item?id=45300655" 524 }, 525 { 526 "hn_id": "41099652", 527 "title": "Stretching Each Dollar: Diffusion Training from Scratch on a Micro-Budget", 528 "points": 3, 529 "comments": 0, 530 "url": "https://news.ycombinator.com/item?id=41099652" 531 }, 532 { 533 "hn_id": "39276859", 534 "title": "Unlearning Reveals the Influential Training Data of Language Models", 535 "points": 3, 536 "comments": 0, 537 "url": "https://news.ycombinator.com/item?id=39276859" 538 }, 539 { 540 "hn_id": "39253748", 541 "title": "A Comprehensive (Bottom-Up) Study on the Security of Arm Cortex-M Systems", 542 "points": 2, 543 "comments": 0, 544 "url": "https://news.ycombinator.com/item?id=39253748" 545 }, 546 { 547 "hn_id": "37792975", 548 "title": "Identifying the Risks of LM Agents with an LM-Emulated Sandbox", 549 "points": 1, 550 "comments": 0, 551 "url": "https://news.ycombinator.com/item?id=37792975" 552 }, 553 { 554 "hn_id": "37767242", 555 "title": "Subjective Face Transform Using Human First Impressions", 556 "points": 1, 557 "comments": 0, 558 "url": 
"https://news.ycombinator.com/item?id=37767242" 559 } 560 ], 561 "top_points": 208, 562 "total_points": 225, 563 "total_comments": 27 564 } 565 }