scan.json (28493B)
1 { 2 "paper": { 3 "title": "LLM-Aided Testbench Generation and Bug Detection for Finite-State Machines", 4 "authors": [ 5 "Jitendra Bhandari", 6 "Johann Knechtel", 7 "Ramesh Narayanaswamy", 8 "Siddharth Garg", 9 "Ramesh Karri" 10 ], 11 "year": 2024, 12 "venue": "arXiv", 13 "arxiv_id": "2406.17132", 14 "doi": "10.48550/arXiv.2406.17132" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval", "case-study"], 19 "key_findings": "Iterative feedback from commercial EDA coverage reports enables GPT-3.5 and GPT-4 to generate testbenches achieving 90–100% FSM transition coverage, compared to 4–75% without feedback. GPT-4 consistently requires fewer iterations than GPT-3.5. For bug detection, dividing I/O traces into small chunks and handling multi-bit outputs individually allows GPT-4 to detect injected bugs across all 100 FSMs (Easy through Hard), while naive prompting fails on complex FSMs.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "Reference [16] provides a GitHub repository: https://github.com/jitendra-bhandari/LLM-Aided-Testbench-Generation-for-FSM, stated to contain RTL source code and testbenches." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The 100 FSMs are sourced from HDLBits (public) and GitHub. The paper states 'Full analysis of the datasets, including the RTL source code and the testbenches are available at [16].' FSM designs and testbenches are released via the GitHub repo." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper names Synopsys VCS U-2023.03-1, Verdi U-2023.03-1, and Python, but provides no requirements.txt, Dockerfile, or detailed environment specification listing library versions or dependencies." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided in the paper. The automation is mentioned ('We automated the framework in Python') but no commands, scripts, or README-level instructions are given." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Table I are point estimates (coverage percentages, pass/fail indicators). No confidence intervals or error bars are reported anywhere." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "Comparisons between GPT3.5 and GPT4, and between methods with and without feedback, are made solely by comparing numbers in tables. No statistical significance tests are performed." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Coverage improvements are reported with baseline context (e.g., 'coverage is 25% for both GPT3.5 and GPT4, whereas for our method it is 100%'). Table I provides both baseline and improved coverage for each FSM, enabling readers to assess effect magnitudes." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "100 FSMs are used with no justification for this number. No power analysis or rationale for why 100 is sufficient to support the claims." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs with no mention of repeated experiments." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table I compares four conditions: GPT3.5 alone, GPT4 alone, GPT3.5+This Work, GPT4+This Work. For bug detection, random fuzzing serves as a non-AI baseline." 
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "GPT-3.5 and GPT-4 were state-of-the-art LLMs at the time of writing. Random fuzzing is a standard hardware testing baseline." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "The paper compares multiple configurations: with/without EDA feedback, three bug detection scenarios (state registers, I/O pairs, fuzzing), and additional prompting techniques (divided I/O, multi-bit handling) as shown in Table I and Figs. 3–4." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Evaluation uses transition coverage percentage, number of iterations required, bug detection success/failure, and number of input patterns needed to detect bugs — four distinct metrics." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is fully automated via Synopsys VCS coverage reports and simulation outputs. No human evaluation of testbench quality or bug detection correctness is performed." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "All 100 FSMs are used for evaluation with no train/test split. Prompts were likely iteratively designed using the same FSMs evaluated, with no held-out set mentioned." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table I breaks down results by complexity level (Easy, Medium, Hard) and by bug detection scenario (State Regs, I/O pairs, Fuzzing). Individual FSM results are shown." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section V discusses failure modes: LLMs getting stuck in repetitive responses, inability to comprehend large I/O traces, context limits preventing full coverage, and difficulty with complex FSMs. 
Section IV-B also discusses limitations like the model not finding correct input patterns." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Table I shows many failure cases (✗) for GPT3.5 and GPT4 without the proposed method, especially on Medium and Hard FSMs. The naive approach to bug detection is explicitly reported as failing to scale." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims 'promising results' for enhanced test coverage and bug detection. These are hedged claims supported by Table I showing coverage improvements (25%→100% in examples) and successful bug detection across complexity levels." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The main causal claim is that EDA feedback improves coverage. The study design compares the same model with and without the feedback loop on the same FSMs, which constitutes controlled single-variable manipulation adequate for this claim." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title scopes to 'Finite-State Machines' but the abstract refers broadly to 'the domain of chip testing.' Results are on 100 FSMs from HDLBits/GitHub with up to 28 states, but the paper does not explicitly bound generalization to these FSM characteristics or acknowledge that results may not extend to industrial-scale designs." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No alternative explanations for the results are discussed. For example, the paper doesn't consider whether GPT models might have memorized HDLBits solutions, or whether the improvements are simply due to additional iterations rather than the specific feedback content." 
137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures FSM transition coverage and bug detection success, and frames claims at the same granularity. The footnote on p.4 explicitly notes that '100% transition coverage does not guarantee bug detection,' acknowledging the proxy gap." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper uses 'GPT3.5' and 'GPT4' throughout without specific API versions, snapshot dates, or model identifiers (e.g., 'gpt-4-0613'). Model behavior changes across versions." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Figs. 3 and 4 provide the actual prompt text used: system prompt (Fig. 3a), coverage feedback prompt (Fig. 3b), bug detection prompt (Fig. 3c), and additional prompts for divided I/O and multi-bit handling (Fig. 4a, 4b). These include example fill values." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or other API settings for GPT-3.5 or GPT-4." 159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The iterative feedback scaffolding is described in detail: Figs. 1 and 2 show the workflow (prompt → compile → simulate → coverage analysis → feedback loop). The compilation check, coverage report parsing, and iterative refinement steps are all documented." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper states FSMs were obtained from HDLBits and GitHub and classified as Easy/Medium/Hard, but provides no selection criteria, filtering steps, or rationale for how the 100 FSMs were chosen from a 'large number' of candidates." 
169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section V 'Conclusions: Key Challenges and Insights' provides substantive discussion of limitations: scalability challenges with large state spaces, difficulty of fine-tuning, lack of labeled testbench datasets, and syntactic differences between RTL and testbenches." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section V discusses threats specific to this study: LLMs producing repetitive responses on complex FSMs, context length limitations preventing comprehension of large I/O traces, and the interdependency between RTL and testbenches being difficult for LLMs to comprehend." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what settings the results do NOT apply to. It does not bound claims to specific FSM sizes, types, or acknowledge that results may not extend to sequential circuits beyond FSMs or to industrial-scale designs." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "Reference [16] provides a GitHub repository with RTL source code and generated testbenches, enabling independent verification of the FSM designs and outputs." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": false, 197 "justification": "The paper states 'we obtained a large number of representative FSMs from HDLBits and GitHub' but does not describe how they were collected, what 'representative' means, or what selection criteria were applied." 198 }, 199 "recruitment_methods_described": { 200 "applies": true, 201 "answer": false, 202 "justification": "No description of how FSMs were selected from HDLBits and GitHub. 
The paper does not explain sampling strategy, inclusion/exclusion criteria, or whether the selection could introduce bias toward simpler or well-known designs." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "The pipeline from collecting FSMs to the final 100 used in experiments is not documented. There is no description of filtering steps, how complexity levels were assigned, or how many candidate FSMs were considered." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding sources or acknowledgments section is present in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: NYU, NYU Abu Dhabi, and Synopsys. Ramesh Narayanaswamy's Synopsys affiliation is disclosed, relevant since Synopsys EDA tools (VCS, Verdi) are used in the experiments." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding is disclosed so independence cannot be assessed. A Synopsys co-author and use of Synopsys commercial tools (VCS, Verdi) in the evaluation creates a potential conflict that is not addressed." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial disclosure statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for GPT-3.5 or GPT-4. This is relevant because the FSMs from HDLBits are publicly available and could be in the training data." 
237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether the HDLBits or GitHub FSMs might appear in GPT training data. HDLBits is a widely-used public resource that is very likely in the training corpus." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "HDLBits problems have been publicly available since well before GPT-3.5/4 training. The paper does not address the risk that models may have memorized these FSMs and their solutions." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All experiments involve automated LLM-EDA tool interactions." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No API costs, token counts, or latency figures are reported despite using commercial GPT API calls iteratively across 100 FSMs." 
291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No total computational budget is stated — neither total API spend nor EDA tool runtime." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No mention of multiple random seeds or repeated runs. LLM outputs are stochastic, but results appear to be from single runs per FSM." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs per FSM is not stated. It is unclear whether each FSM was tested once or multiple times." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "No hyperparameter search is described. Prompt design appears ad hoc with no description of how the prompts in Figs. 3–4 were developed or selected." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "The final prompt configurations shown in Figs. 3–4 are presented without justification for why these specific formulations were chosen over alternatives." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "Many implicit comparisons are made across 100 FSMs, 4 methods, and 3 bug detection scenarios without any statistical tests, let alone correction for multiple comparisons." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors evaluate their own iterative feedback method against baselines (plain GPT prompting) without acknowledging that they designed both the method and the evaluation." 
328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "The proposed method requires multiple iterations (up to 24 for GPT3.5) compared to single-shot baselines, but the compute cost difference is never quantified or discussed." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper uses FSM transition coverage as the primary quality metric without discussing whether coverage is a valid proxy for testbench quality. Footnote 3 notes that 100% coverage doesn't guarantee bug detection, but this gap is not systematically addressed." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "Model comparisons (GPT3.5 vs GPT4) are made under matched conditions — both without scaffold and both with the same scaffold — so the scaffolding confound is addressed by design." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "HDLBits FSMs have been publicly available since well before GPT model training. The paper does not discuss whether the models may have seen these problems and their solutions during training." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The models receive full Verilog RTL code that may overlap with training data from public repositories. No discussion of whether this constitutes feature leakage." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the 100 FSMs share structural similarities or whether results on similar FSMs are independent observations." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method is used. 
No canary strings, membership inference, or decontamination applied." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "LLM-guided testbench generation with EDA feedback achieves 90–100% FSM transition coverage, compared to 4–75% without feedback.", 371 "evidence": "Table I shows coverage improvements across all 100 FSMs. For example, FSM16 goes from 25% (GPT3.5/GPT4 alone) to 100% (both with feedback). GPT4+method achieves 90–100% on all FSMs. Section IV-B discusses the results.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "GPT-4 requires fewer iterations than GPT-3.5 to achieve target coverage.", 376 "evidence": "Table I: GPT4+method uses 2–14 iterations vs GPT3.5+method using 2–24 iterations. Section IV-B states 'the number of iterations required to achieve that is less for GPT4 versus GPT3.5.'", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "Dividing I/O traces into smaller sets and handling multi-bit outputs individually improves LLM bug detection capability.", 381 "evidence": "Table I columns for GPT3.5+This Work and GPT4+This Work show improved bug detection vs vanilla GPT3.5/GPT4 across all scenarios. Section IV-C describes the prompting improvements (Fig. 4). GPT4+method achieves ✓ on all 100 FSMs across all 3 scenarios.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "GPT-4 combined with the proposed method can detect bugs in FSMs of all complexity levels (Easy through Hard) across all three detection scenarios.", 386 "evidence": "Table I and Table II show ✓ for GPT4+This Work across all 100 FSMs for State Regs, I/O pairs, and Fuzzing scenarios. Section IV-C confirms this.", 387 "supported": "weak" 388 } 389 ], 390 "red_flags": [ 391 { 392 "flag": "No uncertainty quantification", 393 "detail": "All results appear to be single-run with no variance, error bars, or repeated trials. LLM outputs are stochastic, so results could vary significantly across runs, especially for the iterative feedback process." 
394 }, 395 { 396 "flag": "Contamination risk unaddressed", 397 "detail": "HDLBits problems are widely available online and very likely in GPT training data. The models may have memorized these FSMs and their solutions, inflating coverage and bug detection results." 398 }, 399 { 400 "flag": "Synopsys conflict of interest", 401 "detail": "The paper has a Synopsys co-author (Narayanaswamy) and relies exclusively on Synopsys commercial tools (VCS, Verdi) in the evaluation, with no acknowledgment of this potential conflict or comparison with open-source EDA tools." 402 }, 403 { 404 "flag": "Perfect results on GPT4+method", 405 "detail": "GPT4+This Work achieves ✓ on all 100 FSMs across all 3 bug detection scenarios (Tables I and II). A 100% success rate with no failures seems too clean given the acknowledged complexity of harder FSMs." 406 }, 407 { 408 "flag": "No model version specifics", 409 "detail": "Only 'GPT3.5' and 'GPT4' are stated without API versions or snapshot dates. Model behavior changes significantly across versions, making results non-reproducible." 410 }, 411 { 412 "flag": "Commercial tool dependency limits reproducibility", 413 "detail": "The entire evaluation pipeline requires Synopsys VCS and Verdi, expensive commercial tools not accessible to most researchers, severely limiting independent verification." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "VerilogEval: Evaluating large language models for Verilog code generation", 419 "authors": ["M. Liu"], 420 "year": 2023, 421 "relevance": "Benchmark for evaluating LLM Verilog code generation capability, directly relevant to understanding LLM performance on hardware description tasks." 422 }, 423 { 424 "title": "AutoChip: Automating HDL generation using LLM feedback", 425 "authors": ["S. Thakur"], 426 "year": 2023, 427 "arxiv_id": "2311.04887", 428 "relevance": "Uses iterative LLM feedback for hardware code generation, a closely related approach to this paper's testbench generation method." 
429 }, 430 { 431 "title": "RTLLM: An open-source benchmark for design RTL generation with large language model", 432 "authors": ["Y. Lu"], 433 "year": 2024, 434 "relevance": "Benchmark for LLM-based RTL generation, relevant to evaluating LLM capabilities in hardware design." 435 }, 436 { 437 "title": "VeriGen: A large language model for Verilog code generation", 438 "authors": ["S. Thakur"], 439 "year": 2023, 440 "relevance": "LLM specialized for Verilog code generation, relevant to understanding LLM capabilities in hardware description languages." 441 }, 442 { 443 "title": "LLM-assisted generation of hardware assertions", 444 "authors": ["R. Kande"], 445 "year": 2023, 446 "arxiv_id": "2306.14027", 447 "relevance": "Uses LLMs for hardware verification assertion generation, a closely related application in chip design verification." 448 }, 449 { 450 "title": "AssertLLM: Generating and evaluating hardware verification assertions from design specifications via multi-LLMs", 451 "authors": ["W. Fang"], 452 "year": 2024, 453 "arxiv_id": "2402.00386", 454 "relevance": "Multi-LLM approach to hardware verification assertions, relevant to LLM-aided chip design verification workflows." 455 }, 456 { 457 "title": "ChatEDA: A large language model powered autonomous agent for EDA", 458 "authors": ["H. Wu"], 459 "year": 2024, 460 "relevance": "LLM-powered autonomous agent for electronic design automation, relevant to agentic AI workflows in hardware design." 461 }, 462 { 463 "title": "ChipNeMo: Domain-adapted LLMs for chip design", 464 "authors": ["M. Liu"], 465 "year": 2023, 466 "arxiv_id": "2311.00176", 467 "relevance": "Domain-adapted LLM for chip design including bug analysis and script generation, directly relevant to LLM capability in hardware engineering." 468 }, 469 { 470 "title": "LLM4EDA: Emerging progress in large language models for electronic design automation", 471 "authors": ["R. 
Zhong"], 472 "year": 2023, 473 "arxiv_id": "2401.12224", 474 "relevance": "Survey of LLM applications in EDA, provides context for the broader landscape of LLM-aided hardware design." 475 }, 476 { 477 "title": "Chip-Chat: Challenges and opportunities in conversational hardware design", 478 "authors": ["J. Blocklove"], 479 "year": 2023, 480 "relevance": "Explores conversational LLM interfaces for hardware design, relevant to understanding LLM prompting strategies for chip design tasks." 481 }, 482 { 483 "title": "GPT4AIGChip: Towards next-generation AI accelerator design automation via large language models", 484 "authors": ["Y. Fu"], 485 "year": 2023, 486 "relevance": "Uses GPT-4 for AI accelerator design automation, relevant to LLM capability assessment in hardware design." 487 }, 488 { 489 "title": "ChipGPT: How far are we from natural language hardware design", 490 "authors": ["K. Chang"], 491 "year": 2023, 492 "arxiv_id": "2305.14019", 493 "relevance": "Evaluates the gap between natural language and hardware design using LLMs, relevant to understanding LLM limitations in this domain." 494 } 495 ], 496 "engagement_factors": { 497 "practical_relevance": { 498 "score": 2, 499 "justification": "The iterative LLM+EDA feedback approach is practically useful for chip designers, though it requires expensive commercial Synopsys tools." 500 }, 501 "surprise_contrarian": { 502 "score": 0, 503 "justification": "Results confirm the expected finding that iterative feedback improves LLM output quality; no surprising or contrarian conclusions." 504 }, 505 "fear_safety": { 506 "score": 0, 507 "justification": "No safety, security, or AI risk concerns raised by the work." 508 }, 509 "drama_conflict": { 510 "score": 0, 511 "justification": "No controversial claims or conflicts with other research." 
512 }, 513 "demo_ability": { 514 "score": 1, 515 "justification": "GitHub repo with FSM designs and testbenches exists, but full reproduction requires commercial Synopsys VCS/Verdi licenses." 516 }, 517 "brand_recognition": { 518 "score": 1, 519 "justification": "Uses well-known GPT models, but the research group and Synopsys collaboration are not widely recognized outside EDA circles." 520 } 521 } 522 }