scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22992B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HaVen: Hallucination-Mitigated LLM for Verilog Code Generation Aligned with HDL Engineers",
      6     "authors": [
      7       "Yiyao Yang",
      8       "Fu Teng",
      9       "Pengju Liu",
     10       "Mengnan Qi",
     11       "Chenyang Lv",
     12       "Ji Li",
     13       "Xuhong Zhang",
     14       "Zhezhi He"
     15     ],
     16     "year": 2025,
     17     "venue": "Design, Automation and Test in Europe",
     18     "arxiv_id": "2501.04908",
     19     "doi": "10.23919/DATE64628.2025.10993072"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All abstract claims (performance improvements, outperforming baselines, correctness gains) are directly supported by Table IV results.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Ablation studies (Fig. 3, Table V, Table VI) demonstrate that SI-CoT, K-dataset, and L-dataset each causally contribute to performance improvements.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Title claims 'alignment with HDL engineers' but evaluation is only on three benchmarks (VerilogEval v1, VerilogEval v2, RTLLM v1.1). No human evaluation with actual engineers was conducted, so the claims exceed the scope of the evidence.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Paper presents methodology and results but does not discuss why SI-CoT works, alternative mechanisms, or competing explanations for observed improvements.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The pass@k metric is clearly distinguished from other quality measures. Measured correctness is functional/syntactic validation, and the claimed correctness is the same quantity, so the proxy is appropriate.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No dedicated limitations or threats-to-validity section. Conclusion is brief and does not discuss limitations.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No specific threats discussed. No discussion of why only benchmarks were evaluated, potential generalization failures, or design limitations.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No explicit boundaries stated for what the results do not show. No discussion of settings, model sizes, or task types where method may fail.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Paper explicitly states funding from National Key R&D Program of China (2022YFB4500200) and National Natural Science Foundation of China (No.62102257).",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations listed: Shanghai Jiao Tong University, Zhejiang University, with specific schools and departments.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Funding is from government research programs, independent of specific method/results outcomes.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement. No mention of patents, equity, consulting, or other financial relationships.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "Hallucination is defined; Verilog and HDL engineers are assumed known. 'Correctness' is not formally defined (the paper uses standard pass@k without justifying its validity as a correctness measure).",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Introduction clearly states three contributions: hallucination taxonomy, SI-CoT mechanism, and data augmentation strategy for HDL-engineer alignment.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Paper discusses CodeHalu, HalluCode, RTLFixer, RTLCoder, OriGen, AutoVCoder, showing positioning and how HAVEN differs. Table IV directly compares against these works.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Paper states 'HAVEN is publicly available at https://github.com/Intelligent-Computing-Research-Group/HaVen'",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "K-dataset (14k pairs) and L-dataset (5k pairs) are referenced but not explicitly confirmed as released. Public GitHub Verilog sources used but processed datasets not confirmed available.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Hyperparameters and GPU hardware specified, but no requirements.txt, Dockerfile, Python version, or dependency specs provided for reproducibility.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "Methodology described but no step-by-step reproduction guide. Readers cannot easily replicate fine-tuning or evaluation without external effort.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "All results are point estimates (e.g., '78.8% pass@1') with no confidence intervals, error bars, or variance estimates.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Comparative claims throughout (e.g., HAVEN outperforms OriGen by 4.7%) but no p-values, t-tests, or statistical significance testing.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Effect sizes given as percentage point improvements: '6.7% increase in pass@1 and 4.7% increase in pass@5' compared to OriGen.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "VerilogEval has 299 total tasks, RTLLM has 29. No justification for why these sample sizes are adequate or any power analysis.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Results reported as single point estimates. Paper mentions temperature sweep (0.2, 0.5, 0.8) but only reports 'best performance' without showing variance across runs or which temperature was optimal.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Table IV includes 10+ baselines: GPT-3.5, GPT-4, StarCoder, CodeLlama, DeepSeek, CodeQwen, RTLCoder, OriGen, AutoVCoder, BetterV.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "All baselines are from 2023-2024, contemporary with this 2025 paper. GPT-4 and recent LLMs included.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Fig. 3 ablates each component: Base, Vanilla, Vanilla+CoT, Vanilla+KL, Vanilla+CoT+KL. Fig. 4 ablates K/L dataset composition.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Pass@1 and pass@5 reported. Syntax and functional correctness measured separately.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": false,
    209           "justification": "Paper evaluates on human-created benchmarks (VerilogEval-Human) but does not conduct independent human evaluation of their generated code or alignment with engineers.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Evaluation on standard benchmarks (VerilogEval v1, v2, RTLLM v1.1) which are held-out test sets.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table V breaks down performance by symbolic modality (truth table 60%, waveform 30.8%, state diagram 52.4%). Limited categorical breakdowns elsewhere.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": false,
    227           "justification": "No discussion of failure modes, error analysis, or types of problems HAVEN still cannot solve.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Paper notes CodeLlama performs worse after fine-tuning: 'After fine-tuning, CodeLlama performs worse than the other two models, which aligns with the experimental results reported in other study.'",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Exact model versions specified: CodeLlama-7b-Instruct, Deepseek-Coder-6.7b-Instruct, CodeQwen1.5-7B-Chat. No commit hashes but reasonable specificity for LLMs.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "SI-CoT process described in Section III-B with examples in Table III, but full prompt templates not provided. Templates with placeholders are described, not complete prompts.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Learning rate 5e-5, optimizer AdamW, scheduler cosine, batch size 256, epochs 3, warmup 15 iterations, temperature {0.2, 0.5, 0.8} all reported.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "SI-CoT scaffolding mechanism detailed in Section III-B with step-by-step breakdown (identify symbolic, parse modalities, add module header) and examples.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "K-dataset and L-dataset generation pipelines documented in Sections III-C and III-D with steps, methods (parser slang, GPT-3.5 rewriting, compiler verification).",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "Evaluation benchmarks (VerilogEval, RTLLM) are public. 550k GitHub Verilog samples are public. But the curated K-dataset and L-dataset are not confirmed released.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "550k samples from GitHub described, curated exemplars from textbooks [19][23][25] identified, GPT-3.5 generation process explained.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "N/A - no human participants recruited. Benchmarks are pre-existing datasets.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "K-dataset pipeline (steps 4-8) and L-dataset pipeline (steps 9-12) thoroughly documented with parser, compiler verification, and augmentation methods.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "Base model training cutoffs not stated. Paper does not disclose when CodeLlama, DeepSeek, and CodeQwen were trained.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "No discussion of whether benchmark examples appear in base model training data or fine-tuning contamination.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "No discussion of benchmark creation dates relative to model training or cutoff dates. Contamination risk not addressed.",
    312           "source": "haiku"
    313         }
    314       },
    315       "cost_and_practicality": {
    316         "inference_cost_reported": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "No inference latency, cost per query, or throughput metrics reported.",
    320           "source": "haiku"
    321         },
    322         "compute_budget_stated": {
    323           "applies": true,
    324           "answer": false,
    325           "justification": "Hardware specified (2x A100-80GB, 3 epochs) but total computation time, cost, or memory requirements not stated.",
    326           "source": "haiku"
    327         }
    328       }
    329     }
    330   },
    331   "claims": [
    332     {
    333       "claim": "HAVEN-DeepSeek achieves 78.8% pass@1 on VerilogEval-Human, outperforming OriGen (74.1%)",
    334       "evidence": "Table IV, row 'HAVEN-DeepSeek' vs 'OriGen-DeepSeek-7B-v1.5'",
    335       "supported": "strong"
    336     },
    337     {
    338       "claim": "SI-CoT alone improves pass@1 by 3.6% and pass@5 by 6.6% on average across base models",
    339       "evidence": "Fig. 3, ablation comparing 'Vanilla' vs 'Vanilla+CoT' across CodeLlama, DeepSeek, CodeQwen",
    340       "supported": "strong"
    341     },
    342     {
    343       "claim": "Fine-tuning with KL-dataset improves pass@1 by 12.3% and pass@5 by 8.7% on average",
    344       "evidence": "Fig. 3, ablation comparing 'Vanilla' vs 'Vanilla+KL'",
    345       "supported": "strong"
    346     },
    347     {
    348       "claim": "HAVEN-CodeQwen achieves 47.4% pass@1 on symbolic modality tasks (truth tables, waveforms, state diagrams), outperforming OriGen and GPT-4 (both 22.7%)",
    349       "evidence": "Table V, results on 44 curated tasks with symbolic components",
    350       "supported": "strong"
    351     },
    352     {
    353       "claim": "HAVEN addresses three orthogonal types of hallucination: symbolic, knowledge, and logical",
    354       "evidence": "Section II, Table II taxonomy with examples for each type",
    355       "supported": "moderate"
    356     },
    357     {
    358       "claim": "The framework aligns generated code with HDL engineer practices by using domain exemplars and data augmentation",
    359       "evidence": "Section III-C describes curated exemplars from textbooks [19][23][25] and Verilog design conventions",
    360       "supported": "weak"
    361     }
    362   ],
    363   "methodology_tags": [
    364     "benchmark-eval"
    365   ],
    366   "key_findings": "HAVEN improves Verilog code generation through three complementary techniques: symbolic-interpretation chain-of-thought (SI-CoT) that converts diagrams/tables to natural language, a knowledge-enhanced dataset (K-dataset, 14k pairs) from textbook exemplars, and a logical-enhanced dataset (L-dataset, 5k pairs) with synthetic reasoning examples. On VerilogEval-Human, HAVEN-DeepSeek achieves 78.8% pass@1 vs. OriGen's 74.1%. Most notably, HAVEN-CodeQwen reaches 47.4% on symbolic modality tasks (truth tables, state diagrams, waveforms) vs. 22.7% for competing methods, suggesting chain-of-thought reasoning substantially reduces hallucinations in hardware specifications.",
    367   "red_flags": [
    368     {
    369       "flag": "No statistical significance testing",
    370       "detail": "All performance improvements (4.7-6.7pp) reported as point estimates with no confidence intervals, p-values, or significance tests. Unclear if differences are statistically reliable."
    371     },
    372     {
    373       "flag": "No human evaluation of outputs",
    374       "detail": "Paper claims 'alignment with HDL engineers' but only evaluates on benchmarks. No human engineers assessed whether generated code actually aligns with real-world practices."
    375     },
    376     {
    377       "flag": "No failure case analysis",
    378       "detail": "Only positive results reported. No discussion of failure modes, error types, or problem categories where HAVEN still underperforms."
    379     },
    380     {
    381       "flag": "Contamination not addressed",
    382       "detail": "Base model training cutoffs not stated. Unknown whether VerilogEval/RTLLM benchmark examples appear in CodeLlama, DeepSeek, or CodeQwen training data."
    383     },
    384     {
    385       "flag": "Temperature selection not transparent",
    386       "detail": "Paper reports 'best performance' from temperatures {0.2, 0.5, 0.8} without stating which was optimal. Allows implicit cherry-picking."
    387     },
    388     {
    389       "flag": "Limited dataset release transparency",
    390       "detail": "K-dataset (14k) and L-dataset (5k) not explicitly confirmed as released. Reproducibility may be limited without access to augmented datasets."
    391     },
    392     {
    393       "flag": "Ablation limited to one base model",
    394       "detail": "Ablation study (Fig. 3, 4) primarily on CodeQwen. Generalization of component contributions to other base models unclear."
    395     },
    396     {
    397       "flag": "Symbolic modality evaluation subset",
    398       "detail": "Table V evaluation uses only 44 curated tasks from VerilogEval-Human. Not a systematic sample—potential bias toward problems SI-CoT handles well."
    399     },
    400     {
    401       "flag": "No variance reporting across runs",
    402       "detail": "Single point estimates reported; no standard deviation or confidence intervals even though stochastic generation could cause variance."
    403     },
    404     {
    405       "flag": "Scope-claims mismatch",
    406       "detail": "Title claims 'Aligned with HDL Engineers' but validation is purely benchmark-based (VerilogEval, RTLLM). No alignment with actual engineers validated."
    407     }
    408   ],
    409   "cited_papers": [
    410     {
    411       "title": "Investigating code hallucinations in llms via execution-based verification",
    412       "authors": "Tian et al. (CodeHalu)",
    413       "year": 2024,
    414       "relevance": "Directly addresses hallucination detection in code generation; foundational work on the problem HAVEN tackles."
    415     },
    416     {
    417       "title": "Exploring and evaluating hallucinations in llm-powered code generation",
    418       "authors": "Liu et al. (HalluCode)",
    419       "year": 2024,
    420       "relevance": "Parallel work on hallucination taxonomy in code generation; core motivation for HAVEN."
    421     },
    422     {
    423       "title": "Verilogeval: Evaluating large language models for verilog code generation",
    424       "authors": "Liu, Pinckney, Khailany, Ren",
    425       "year": 2023,
    426       "relevance": "Main benchmark used for evaluation. Establishes VerilogEval-Human and VerilogEval-Machine datasets."
    427     },
    428     {
    429       "title": "OriGen: Enhancing rtl code generation with code-to-code augmentation and self-reflection",
    430       "authors": "Cui et al.",
    431       "year": 2024,
    432       "relevance": "Primary baseline and competing approach. Demonstrates prior data augmentation strategy that HAVEN improves upon."
    433     },
    434     {
    435       "title": "RTLCoder: Outperforming GPT-3.5 in design RTL generation with our open-source dataset and lightweight solution",
    436       "authors": "Liu, Fang, Lu, Zhang, Zhang, Xie",
    437       "year": 2024,
    438       "relevance": "Prior Verilog-specific LLM approach; baseline for comparison."
    439     },
    440     {
    441       "title": "AutoVCoder: A systematic framework for automated verilog code generation using llms",
    442       "authors": "Gao et al.",
    443       "year": 2024,
    444       "relevance": "State-of-the-art competing framework using large-scale synthetic data generation; key baseline."
    445     },
    446     {
    447       "title": "WizardLM: Empowering large language models to follow complex instructions",
    448       "authors": "Xu, Wang, Liu, Ding, Zhang",
    449       "year": 2024,
    450       "relevance": "Instruction evolution methodology adapted by HAVEN for L-dataset generation."
    451     },
    452     {
    453       "title": "Revisiting verilogeval: Newer llms, in-context learning, and specification-to-rtl tasks",
    454       "authors": "Pinckney, Batten, Liu, Ren, Khailany",
    455       "year": 2024,
    456       "relevance": "VerilogEval v2 benchmark with human-aligned prompts; updated evaluation target."
    457     }
    458   ],
    459   "engagement_factors": {
    460     "practical_relevance": {
    461       "score": 2,
    462       "justification": "Code released and usable, but requires setting up fine-tuning infrastructure. Benchmark-only evaluation leaves real-world applicability uncertain for actual hardware engineers."
    463     },
    464     "surprise_contrarian": {
    465       "score": 1,
    466       "justification": "Chain-of-thought for symbolic reasoning is intuitive and expected. No surprising findings or contrarian claims."
    467     },
    468     "fear_safety": {
    469       "score": 0,
    470       "justification": "No AI safety, security, or alignment concerns raised. Technical contribution only."
    471     },
    472     "drama_conflict": {
    473       "score": 0,
    474       "justification": "Straightforward technical paper. No controversy, industry conflict, or dramatic narrative."
    475     },
    476     "demo_ability": {
    477       "score": 2,
    478       "justification": "Code available on GitHub but requires GPU setup, fine-tuning on 62k examples, and evaluation infrastructure. Not a quick-try demo."
    479     },
    480     "brand_recognition": {
    481       "score": 1,
    482       "justification": "Chinese universities (Shanghai Jiao Tong, Zhejiang) are strong in CS but not globally top-tier like Stanford/MIT. Limited brand reach in Western ML community."
    483     }
    484   },
    485   "hn_data": {
    486     "threads": [],
    487     "top_points": 0,
    488     "total_points": 0,
    489     "total_comments": 0
    490   }
    491 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs