scan-v5.json (22992B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "HaVen: Hallucination-Mitigated LLM for Verilog Code Generation Aligned with HDL Engineers", 6 "authors": [ 7 "Yiyao Yang", 8 "Fu Teng", 9 "Pengju Liu", 10 "Mengnan Qi", 11 "Chenyang Lv", 12 "Ji Li", 13 "Xuhong Zhang", 14 "Zhezhi He" 15 ], 16 "year": 2025, 17 "venue": "Design, Automation and Test in Europe", 18 "arxiv_id": "2501.04908", 19 "doi": "10.23919/DATE64628.2025.10993072" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "All abstract claims (performance improvements, outperforming baselines, correctness gains) are directly supported by Table IV results.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "Ablation studies (Fig. 3, Table V, Table VI) demonstrate that SI-CoT, K-dataset, and L-dataset each causally contribute to performance improvements.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": false, 38 "justification": "Title claims 'alignment with HDL engineers' but evaluation is only on three benchmarks (VerilogEval v1, VerilogEval v2, RTLLM v1.1). 
No human evaluation with actual engineers; claims exceed evidence scope.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "Paper presents methodology and results but does not discuss why SI-CoT works, alternative mechanisms, or competing explanations for observed improvements.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": true, 49 "answer": true, 50 "justification": "Pass@k metric is clearly distinguished from other quality measures; measured correctness is functional/syntactic validation, claimed correctness is same—appropriate proxy.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": false, 58 "justification": "No dedicated limitations or threats-to-validity section. Conclusion is brief and does not discuss limitations.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": false, 64 "justification": "No specific threats discussed. No discussion of why only benchmarks were evaluated, potential generalization failures, or design limitations.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": false, 70 "justification": "No explicit boundaries stated for what the results do not show. 
No discussion of settings, model sizes, or task types where method may fail.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Paper explicitly states funding from National Key R&D Program of China (2022YFB4500200) and National Natural Science Foundation of China (No.62102257).", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations listed: Shanghai Jiao Tong University, Zhejiang University, with specific schools and departments.", 85 "source": "haiku" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": true, 89 "answer": true, 90 "justification": "Funding is from government research programs, independent of specific method/results outcomes.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests statement. No mention of patents, equity, consulting, or other financial relationships.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": false, 104 "justification": "Hallucination is defined; Verilog and HDL engineers assumed known. 'Correctness' not formally defined (uses standard pass@k without justifying its validity as correctness measure).", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "Introduction clearly states three contributions: hallucination taxonomy, SI-CoT mechanism, and data augmentation strategy for HDL-engineer alignment.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Paper discusses CodeHalu, HalluCode, RTLFixer, RTLCoder, OriGen, AutoVCoder, showing positioning and how HAVEN differs. 
Table IV directly compares against these works.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "Paper states 'HAVEN is publicly available at https://github.com/Intelligent-Computing-Research-Group/HaVen'", 128 "source": "haiku" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": false, 133 "justification": "K-dataset (14k pairs) and L-dataset (5k pairs) are referenced but not explicitly confirmed as released. Public GitHub Verilog sources used but processed datasets not confirmed available.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "Hyperparameters and GPU hardware specified, but no requirements.txt, Dockerfile, Python version, or dependency specs provided for reproducibility.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "Methodology described but no step-by-step reproduction guide. 
Readers cannot easily replicate fine-tuning or evaluation without external effort.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": false, 153 "justification": "All results are point estimates (e.g., '78.8% pass@1') with no confidence intervals, error bars, or variance estimates.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "Comparative claims throughout (e.g., HAVEN outperforms OriGen by 4.7%) but no p-values, t-tests, or statistical significance testing.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Effect sizes given as percentage point improvements: '6.7% increase in pass@1 and 4.7% increase in pass@5' compared to OriGen.", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "VerilogEval has 299 total tasks, RTLLM has 29. No justification for why these sample sizes are adequate or any power analysis.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": false, 177 "justification": "Results reported as single point estimates. Paper mentions temperature sweep (0.2, 0.5, 0.8) but only reports 'best performance' without showing variance across runs or which temperature was optimal.", 178 "source": "haiku" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Table IV includes 10+ baselines: GPT-3.5, GPT-4, StarCoder, CodeLlama, DeepSeek, CodeQwen, RTLCoder, OriGen, AutoVCoder, BetterV.", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "All baselines are from 2023-2024, contemporary with this 2025 paper. 
GPT-4 and recent LLMs included.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "Fig. 3 ablates each component: Base, Vanilla, Vanilla+CoT, Vanilla+KL, Vanilla+CoT+KL. Fig. 4 ablates K/L dataset composition.", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Pass@1 and pass@5 reported. Syntax and functional correctness measured separately.", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": true, 208 "answer": false, 209 "justification": "Paper evaluates on human-created benchmarks (VerilogEval-Human) but does not conduct independent human evaluation of their generated code or alignment with engineers.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "Evaluation on standard benchmarks (VerilogEval v1, v2, RTLLM v1.1) which are held-out test sets.", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Table V breaks down performance by symbolic modality (truth table 60%, waveform 30.8%, state diagram 52.4%). 
Limited categorical breakdowns elsewhere.", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": false, 227 "justification": "No discussion of failure modes, error analysis, or types of problems HAVEN still cannot solve.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "Paper notes CodeLlama performs worse after fine-tuning: 'After fine-tuning, CodeLlama performs worse than the other two models, which aligns with the experimental results reported in other study.'", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": true, 241 "justification": "Exact model versions specified: CodeLlama-7b-Instruct, Deepseek-Coder-6.7b-Instruct, CodeQwen1.5-7B-Chat. No commit hashes but reasonable specificity for LLMs.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": false, 247 "justification": "SI-CoT process described in Section III-B with examples in Table III, but full prompt templates not provided. 
Templates with placeholders are described, not complete prompts.", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": true, 253 "justification": "Learning rate 5e-5, optimizer AdamW, scheduler cosine, batch size 256, epochs 3, warmup 15 iterations, temperature {0.2, 0.5, 0.8} all reported.", 254 "source": "haiku" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "SI-CoT scaffolding mechanism detailed in Section III-B with step-by-step breakdown (identify symbolic, parse modalities, add module header) and examples.", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "K-dataset and L-dataset generation pipelines documented in Sections III-C and III-D with steps, methods (parser slang, GPT-3.5 rewriting, compiler verification).", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": false, 273 "justification": "Evaluation benchmarks (VerilogEval, RTLLM) are public. 550k GitHub Verilog samples are public. But the curated K-dataset and L-dataset are not confirmed released.", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "550k samples from GitHub described, curated exemplars from textbooks [19][23][25] identified, GPT-3.5 generation process explained.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "N/A - no human participants recruited. 
Benchmarks are pre-existing datasets.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": true, 291 "justification": "K-dataset pipeline (steps 4-8) and L-dataset pipeline (steps 9-12) thoroughly documented with parser, compiler verification, and augmentation methods.", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Base model training cutoffs not stated. Paper does not disclose when CodeLlama, DeepSeek, and CodeQwen were trained.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No discussion of whether benchmark examples appear in base model training data or fine-tuning contamination.", 306 "source": "haiku" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of benchmark creation dates relative to model training or cutoff dates. Contamination risk not addressed.", 312 "source": "haiku" 313 } 314 }, 315 "cost_and_practicality": { 316 "inference_cost_reported": { 317 "applies": true, 318 "answer": false, 319 "justification": "No inference latency, cost per query, or throughput metrics reported.", 320 "source": "haiku" 321 }, 322 "compute_budget_stated": { 323 "applies": true, 324 "answer": false, 325 "justification": "Hardware specified (2x A100-80GB, 3 epochs) but total computation time, cost, or memory requirements not stated.", 326 "source": "haiku" 327 } 328 } 329 } 330 }, 331 "claims": [ 332 { 333 "claim": "HAVEN-DeepSeek achieves 78.8% pass@1 on VerilogEval-Human, outperforming OriGen (74.1%)", 334 "evidence": "Table IV, row 'HAVEN-DeepSeek' vs 'OriGen-DeepSeek-7B-v1.5'", 335 "supported": "strong" 336 }, 337 { 338 "claim": "SI-CoT alone improves pass@1 by 3.6% and pass@5 by 6.6% on average across base models", 339 "evidence": "Fig. 
3, ablation comparing 'Vanilla' vs 'Vanilla+CoT' across CodeLlama, DeepSeek, CodeQwen", 340 "supported": "strong" 341 }, 342 { 343 "claim": "Fine-tuning with KL-dataset improves pass@1 by 12.3% and pass@5 by 8.7% on average", 344 "evidence": "Fig. 3, ablation comparing 'Vanilla' vs 'Vanilla+KL'", 345 "supported": "strong" 346 }, 347 { 348 "claim": "HAVEN-CodeQwen achieves 47.4% pass@1 on symbolic modality tasks (truth tables, waveforms, state diagrams), outperforming OriGen and GPT-4 (both 22.7%)", 349 "evidence": "Table V, results on 44 curated tasks with symbolic components", 350 "supported": "strong" 351 }, 352 { 353 "claim": "HAVEN addresses hallucinations through three orthogonal types: symbolic, knowledge, and logical", 354 "evidence": "Section II, Table II taxonomy with examples for each type", 355 "supported": "moderate" 356 }, 357 { 358 "claim": "The framework aligns generated code with HDL engineer practices by using domain exemplars and data augmentation", 359 "evidence": "Section III-C describes curated exemplars from textbooks [19][23][25] and Verilog design conventions", 360 "supported": "weak" 361 } 362 ], 363 "methodology_tags": [ 364 "benchmark-eval" 365 ], 366 "key_findings": "HAVEN improves Verilog code generation through three complementary techniques: symbolic-interpretation chain-of-thought (SI-CoT) that converts diagrams/tables to natural language, a knowledge-enhanced dataset (K-dataset, 14k pairs) from textbook exemplars, and a logical-enhanced dataset (L-dataset, 5k pairs) with synthetic reasoning examples. On VerilogEval-Human, HAVEN-DeepSeek achieves 78.8% pass@1 vs. OriGen's 74.1%. Most notably, HAVEN-CodeQwen reaches 47.4% on symbolic modality tasks (truth tables, state diagrams, waveforms) vs. 
22.7% for competing methods, suggesting chain-of-thought reasoning substantially reduces hallucinations in hardware specifications.", 367 "red_flags": [ 368 { 369 "flag": "No statistical significance testing", 370 "detail": "All performance improvements (4.7-6.7pp) reported as point estimates with no confidence intervals, p-values, or significance tests. Unclear if differences are statistically reliable." 371 }, 372 { 373 "flag": "No human evaluation of outputs", 374 "detail": "Paper claims 'alignment with HDL engineers' but only evaluates on benchmarks. No human engineers assessed whether generated code actually aligns with real-world practices." 375 }, 376 { 377 "flag": "No failure case analysis", 378 "detail": "Apart from noting that CodeLlama performs worse after fine-tuning, the paper reports only positive results. No discussion of failure modes, error types, or problem categories where HAVEN still underperforms." 379 }, 380 { 381 "flag": "Contamination not addressed", 382 "detail": "Base model training cutoffs not stated. Unknown whether VerilogEval/RTLLM benchmark examples appear in CodeLlama, DeepSeek, or CodeQwen training data." 383 }, 384 { 385 "flag": "Temperature selection not transparent", 386 "detail": "Paper reports 'best performance' from temperatures {0.2, 0.5, 0.8} without stating which was optimal. Allows implicit cherry-picking." 387 }, 388 { 389 "flag": "Limited dataset release transparency", 390 "detail": "K-dataset (14k) and L-dataset (5k) not explicitly confirmed as released. Reproducibility may be limited without access to augmented datasets." 391 }, 392 { 393 "flag": "Ablation limited to one base model", 394 "detail": "Ablation study (Fig. 3, 4) primarily on CodeQwen. Generalization of component contributions to other base models unclear." 395 }, 396 { 397 "flag": "Symbolic modality evaluation subset", 398 "detail": "Table V evaluation uses only 44 curated tasks from VerilogEval-Human. Not a systematic sample—potential bias toward problems SI-CoT handles well." 
399 }, 400 { 401 "flag": "No variance reporting across runs", 402 "detail": "Single point estimates reported; no standard deviation or confidence intervals even though stochastic generation could cause variance." 403 }, 404 { 405 "flag": "Scope-claims mismatch", 406 "detail": "Title claims 'Aligned with HDL Engineers' but validation is purely benchmark-based (VerilogEval, RTLLM). No alignment with actual engineers validated." 407 } 408 ], 409 "cited_papers": [ 410 { 411 "title": "Investigating code hallucinations in llms via execution-based verification", 412 "authors": "Tian et al. (CodeHalu)", 413 "year": 2024, 414 "relevance": "Directly addresses hallucination detection in code generation; foundational work on the problem HAVEN tackles." 415 }, 416 { 417 "title": "Exploring and evaluating hallucinations in llm-powered code generation", 418 "authors": "Liu et al. (HalluCode)", 419 "year": 2024, 420 "relevance": "Parallel work on hallucination taxonomy in code generation; core motivation for HAVEN." 421 }, 422 { 423 "title": "Verilogeval: Evaluating large language models for verilog code generation", 424 "authors": "Liu, Pinckney, Khailany, Ren", 425 "year": 2023, 426 "relevance": "Main benchmark used for evaluation. Establishes VerilogEval-Human and VerilogEval-Machine datasets." 427 }, 428 { 429 "title": "OriGen: Enhancing rtl code generation with code-to-code augmentation and self-reflection", 430 "authors": "Cui et al.", 431 "year": 2024, 432 "relevance": "Primary baseline and competing approach. Demonstrates prior data augmentation strategy that HAVEN improves upon." 433 }, 434 { 435 "title": "RTLCoder: Outperforming GPT-3.5 in design RTL generation with our open-source dataset and lightweight solution", 436 "authors": "Liu, Fang, Lu, Zhang, Zhang, Xie", 437 "year": 2024, 438 "relevance": "Prior Verilog-specific LLM approach; baseline for comparison." 
439 }, 440 { 441 "title": "AutoVCoder: A systematic framework for automated verilog code generation using llms", 442 "authors": "Gao et al.", 443 "year": 2024, 444 "relevance": "State-of-the-art competing framework using large-scale synthetic data generation; key baseline." 445 }, 446 { 447 "title": "WizardLM: Empowering large language models to follow complex instructions", 448 "authors": "Xu, Wang, Liu, Ding, Zhang", 449 "year": 2024, 450 "relevance": "Instruction evolution methodology adapted by HAVEN for L-dataset generation." 451 }, 452 { 453 "title": "Revisiting verilogeval: Newer llms, in-context learning, and specification-to-rtl tasks", 454 "authors": "Pinckney, Batten, Liu, Ren, Khailany", 455 "year": 2024, 456 "relevance": "VerilogEval v2 benchmark with human-aligned prompts; updated evaluation target." 457 } 458 ], 459 "engagement_factors": { 460 "practical_relevance": { 461 "score": 2, 462 "justification": "Code released and usable, but requires setting up fine-tuning infrastructure. Benchmark-only evaluation leaves real-world applicability uncertain for actual hardware engineers." 463 }, 464 "surprise_contrarian": { 465 "score": 1, 466 "justification": "Chain-of-thought for symbolic reasoning is intuitive and expected. No surprising findings or contrarian claims." 467 }, 468 "fear_safety": { 469 "score": 0, 470 "justification": "No AI safety, security, or alignment concerns raised. Technical contribution only." 471 }, 472 "drama_conflict": { 473 "score": 0, 474 "justification": "Straightforward technical paper. No controversy, industry conflict, or dramatic narrative." 475 }, 476 "demo_ability": { 477 "score": 2, 478 "justification": "Code available on GitHub but requires GPU setup, fine-tuning on 62k examples, and evaluation infrastructure. Not a quick-try demo." 479 }, 480 "brand_recognition": { 481 "score": 1, 482 "justification": "Chinese universities (Shanghai Jiao Tong, Zhejiang) are strong in CS but not globally top-tier like Stanford/MIT. 
Limited brand reach in Western ML community." 483 } 484 }, 485 "hn_data": { 486 "threads": [], 487 "top_points": 0, 488 "total_points": 0, 489 "total_comments": 0 490 } 491 }