scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29853B)
      1 {
      2   "paper": {
      3     "title": "HAVEN: Hallucination-Mitigated LLM for Verilog Code Generation Aligned with HDL Engineers",
      4     "authors": [
      5       "Yiyao Yang",
      6       "Fu Teng",
      7       "Pengju Liu",
      8       "Mengnan Qi",
      9       "Chenyang Lv",
     10       "Ji Li",
     11       "Xuhong Zhang",
     12       "Zhezhi He"
     13     ],
     14     "year": 2025,
     15     "venue": "Design, Automation and Test in Europe",
     16     "arxiv_id": "2501.04908",
     17     "doi": "10.23919/DATE64628.2025.10993072"
     18   },
     19   "scan_version": 3,
     20   "active_modules": [
     21     "experimental_rigor",
     22     "data_leakage"
     23   ],
     24   "methodology_tags": [
     25     "benchmark-eval"
     26   ],
     27   "key_findings": "HAVEN introduces a hallucination taxonomy for Verilog code generation (symbolic, knowledge, logical) and addresses each type through a three-stage methodology: symbolic interpretation chain-of-thought (SI-CoT), knowledge-enhanced dataset, and logical-enhanced dataset. Fine-tuned on ~62K data samples across three base models (CodeLlama-7B, DeepSeek-Coder-6.7B, CodeQwen-7B), HAVEN achieves up to 61.1% pass@1 on VerilogEval-Human, outperforming OriGen by 6.7 percentage points. On symbolic modality tasks specifically, HAVEN-CodeQwen achieves 47.4% pass@1, substantially outperforming GPT-4 (22.7%) and DeepSeek-Coder-V2 (34.1%).",
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The abstract states 'HAVEN is publicly available at https://github.com/Intelligent-Computing-Research-Group/HaVen' with a direct GitHub URL."
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The evaluation benchmarks (VerilogEval v1, RTLLM v1.1, VerilogEval v2) are standard, publicly available datasets, which is the basis for this answer. The training data pipeline is described, but the paper does not explicitly confirm that the synthesized KL-dataset is released."
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper mentions 'two Nvidia A100-80GB GPUs' and some training hyperparameters but provides no requirements.txt, Dockerfile, or detailed library/framework versions sufficient to recreate the environment."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is referenced but the paper itself contains no reproduction guide."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "All results in Tables IV, V, and Figures 3-4 report only point estimates (e.g., '61.1%') with no confidence intervals or error bars."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper claims 'outperforms' and 'significantly improves' based solely on comparing point estimates in Tables IV and V. No statistical significance tests (p-values, t-tests, etc.) are reported."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper reports percentage improvements with baseline context, e.g., 'up to a 6.7% increase in pass@1' over OriGen, and '4.7% increase in pass@5' (Section IV-B). Absolute values are provided for both methods."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No justification is given for why n=10 trials were used for pass@k, or why the specific benchmark sizes (143/156/29 tasks) are sufficient for the claims made."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. The pass@k formula uses n=10 but only point estimates are shown."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table IV includes extensive baselines: general LLMs (GPT-3.5, GPT-4, StarCoder, CodeLlama, DeepSeek-Coder, CodeQwen) and Verilog-specific models (ChipNeMo, RTLCoder, BetterV, AutoVCoder, OriGen)."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Baselines include OriGen (2024), AutoVCoder (2024), BetterV (2024), DeepSeek-Coder-V2 (2024), and RTLCoder (2024), which represent the contemporary state of the art in Verilog code generation."
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Figure 3 shows a five-setting ablation (base, vanilla, vanilla+CoT, vanilla+KL, vanilla+CoT+KL) across all three base models. Figure 4 shows the effect of K-dataset and L-dataset composition independently."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper reports pass@1 and pass@5 for functional correctness, plus syntax correctness (syntax pass@5) on RTLLM (Table IV)."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Evaluation is entirely automated through pass@k on benchmark test suites. No human evaluation of generated Verilog code quality is included, despite claims of alignment with HDL engineer practices."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The evaluation uses established external benchmarks (VerilogEval v1, RTLLM v1.1, VerilogEval v2) that are separate from the fine-tuning data (synthesized KL-dataset from GitHub code)."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table V breaks down performance by symbolic modality type (truth table, waveform, state diagram). Table IV breaks down by benchmark. Figure 4 analyzes K-dataset vs L-dataset contributions separately."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "Table II shows hallucination examples as motivation but these are not failure cases of HAVEN itself. No discussion of where HAVEN fails or error analysis of HAVEN's incorrect outputs."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper reports that CodeLlama 'performs worse than the other two models' after fine-tuning (Section IV-B), and acknowledges HAVEN-DeepSeek's syntax pass@5 is '4-6% lower' than AutoVCoder despite using a much smaller dataset."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The abstract claims 'significantly improves the correctness of Verilog code generation, outperforming state-of-the-art LLM-based Verilog generation methods on VerilogEval and RTLLM benchmark.' Table IV supports these claims with HAVEN variants achieving top results across all three benchmarks."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Causal claims that SI-CoT and KL-dataset 'improve' performance are supported by the ablation study (Fig. 3), which adds components incrementally in a controlled manner. The ablation design is adequate for these causal claims."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The title claims alignment 'with HDL Engineers' but evaluation is limited to automated benchmarks. Claims about 'practices of HDL engineers' (Section I) extend beyond what pass@k on benchmarks can demonstrate. No actual HDL engineers evaluated the outputs."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as the effect of additional training data volume, possible benchmark-specific overfitting, or whether improvements come from the framework vs. the data augmentation alone."
    145       },
    146       "proxy_outcome_distinction": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper measures pass@k on benchmarks but frames results as 'aligned with practices of HDL engineers' and 'suitable for the use cases of prompts generated by engineers.' No discussion of the gap between benchmark pass@k and actual HDL engineering utility."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The base models are named as 'CodeLlama-7b-Instruct', 'Deepseek-Coder-6.7b-Instruct', and 'CodeQwen1.5-7B-Chat', which is reasonably specific. However, GPT-3.5, which is used for data synthesis (a critical pipeline component), has no version specified, and GPT-4, used as a baseline, has no version either."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper describes what SI-CoT does and shows example interpretations (Table III) but does not provide the actual prompt text used for the CoT prompting model or the code generation prompts."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section IV-A reports: AdamW optimizer, cosine learning rate scheduler, 15 warm-up iterations, learning rate 5e-5, global batch size 256, 3 epochs, temperature settings of 0.2/0.5/0.8, and n=10 for pass@k."
    167       },
    168       "scaffolding_described": {
    169         "applies": false,
    170         "answer": false,
    171         "justification": "HAVEN uses a two-stage pipeline (CoT prompting model → CodeGen-LLM) but this is not agentic scaffolding — there are no tools, loops, retry logic, or memory management. It is a straightforward inference pipeline."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section III-C/D documents the full pipeline: ~550K Verilog samples collected from GitHub → GPT-3.5 generates vanilla instructions → parser (slang) for topic matching → exemplar-based augmentation → Verilog compiler verification → ~43K vanilla pairs, 14K K-dataset, 5K L-dataset."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "There is no dedicated limitations or threats-to-validity section. The paper moves directly from experiments (Section IV) to conclusion (Section V) without discussing limitations."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No threats to validity are discussed anywhere in the paper."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what settings were not tested, or what limitations apply to the generalization of results."
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The evaluation benchmarks are public, but the synthesized training datasets (43K vanilla pairs, 14K K-dataset, 5K L-dataset) and the curated exemplars are not explicitly stated as released."
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section III-C describes collecting ~550,000 Verilog code samples from public GitHub repositories, curating exemplars from textbooks, and the full data synthesis pipeline including parser-based topic matching and GPT-3.5 augmentation."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "No human participants. Data sources are public GitHub repositories and standard benchmarks."
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The pipeline is documented with counts at each stage: ~550K raw samples → vanilla instruction generation → parser matching → augmentation → compiler verification → yielding ~43K vanilla pairs, 14K K-dataset, and 5K L-dataset (Sections III-C and III-D)."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funding is disclosed: 'This work is partially supported by National Key R&D Program of China (2022YFB4500200), National Natural Science Foundation of China (No.62102257).'"
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, Zhejiang University, and one independent researcher. No commercial product affiliation conflict exists."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Funding comes from Chinese government research programs (National Key R&D Program, NSFC), which have no financial stake in the specific experimental outcomes."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No competing interests or financial interests statement is included in the paper."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The training data cutoff dates for the base models (CodeLlama, DeepSeek-Coder, CodeQwen) are not stated. The paper does not discuss when their training data ends."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No discussion of whether VerilogEval or RTLLM benchmark problems appeared in the base models' pre-training data or in the fine-tuning data collected from GitHub."
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "VerilogEval was published in 2023 and the base models were likely trained on data that could include it. No contamination analysis is performed or discussed."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study. Evaluation is entirely automated benchmark-based."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "demographics_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "blinding_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       },
    288       "attrition_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants in this study."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No inference cost, latency, or tokens consumed per example are reported. The two-stage pipeline (CoT + CodeGen) presumably doubles inference cost but this is not discussed."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "The paper mentions 'two Nvidia A100-80GB GPUs' and '3 epochs' but does not quantify total training time, GPU hours, or computational cost."
    304       }
    305     },
    306     "experimental_rigor": {
    307       "seed_sensitivity_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No multi-seed experiments. Results are reported as single point estimates without seed sensitivity analysis."
    311       },
    312       "number_of_runs_stated": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "The paper explicitly states n=10 trials per problem for the pass@k metric (Section IV-A, Equation 1)."
    316       },
    317       "hyperparameter_search_budget": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper tries temperatures 0.2, 0.5, and 0.8 and reports 'the best performance,' but does not disclose the full search budget or which temperature was selected for each model/benchmark combination."
    321       },
    322       "best_config_selection_justified": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper states 'we set the temperature of each model to 0.2, 0.5 and 0.8, reporting the best performance' (Section IV-A). This reports the maximum across three configurations without specifying which was best or using a validation set for selection."
    326       },
    327       "multiple_comparison_correction": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Many comparisons are made across models, benchmarks, and ablation settings. No statistical tests are performed at all, let alone corrections for multiple comparisons."
    331       },
    332       "self_comparison_bias_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The authors compare their HAVEN system against baselines without acknowledging the bias of evaluating one's own system. Several baselines (BetterV, AutoVCoder, ChipNeMo) are not publicly available, so results may come from the original papers under different conditions."
    336       },
    337       "compute_budget_vs_performance": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of compute differences. HAVEN uses a two-stage pipeline (CoT + CodeGen) which presumably requires more compute than single-stage baselines, but this is not compared or discussed."
    341       },
    342       "benchmark_construct_validity": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "The paper uses VerilogEval v2 because it 'focuses on human evaluation problems' but does not analyze whether pass@k on any benchmark actually measures alignment with HDL engineer practices as claimed."
    346       },
    347       "scaffold_confound_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "HAVEN adds a CoT pre-processing stage that other baselines lack, creating a scaffold confound. When comparing HAVEN (CoT + fine-tuned model) against baselines (single-stage models), the improvement could be from the scaffold rather than the model. This is partially addressed in the ablation but not when comparing against external baselines."
    351       }
    352     },
    353     "data_leakage": {
    354       "temporal_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "VerilogEval v1 was published in 2023. The base models could have been trained on data including its solutions. No temporal leakage analysis is provided."
    358       },
    359       "feature_leakage_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether the evaluation setup leaks information. The SI-CoT interpretation step provides additional context that baselines don't receive, which could constitute a form of feature advantage rather than leakage, but this asymmetry is not analyzed."
    363       },
    364       "non_independence_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "The fine-tuning data comes from ~550K GitHub Verilog samples. No analysis of whether these overlap with VerilogEval or RTLLM benchmark sources."
    368       },
    369       "leakage_detection_method": {
    370         "applies": true,
    371         "answer": false,
    372         "justification": "No leakage detection or prevention methods are used (no canary strings, membership inference, n-gram overlap, or decontamination)."
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "HAVEN-CodeQwen outperforms OriGen by 6.7% in pass@1 and 4.7% in pass@5 on VerilogEval(v1)-Human.",
    379       "evidence": "Table IV shows HAVEN-CodeQwen achieves 61.1% pass@1 and 64.8% pass@5 vs. OriGen's 54.4% pass@1 and 60.1% pass@5 on VerilogEval-Human (Section IV-B).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "HAVEN-DeepSeek achieves the best functional pass@5 on RTLLM v1.1, outperforming both OriGen and GPT-4 by 0.5%.",
    384       "evidence": "Table IV shows HAVEN-DeepSeek at 66.0% functional pass@5 on RTLLM v1.1, vs. OriGen at 65.5% and GPT-4 at 65.5%.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "SI-CoT improves pass@1 and pass@5 by 3.6% and 6.6% on average across three base models.",
    389       "evidence": "Figure 3 ablation study comparing 'vanilla' vs. 'vanilla+CoT' settings on VerilogEval(v1)-Human across CodeLlama, DeepSeek-Coder, and CodeQwen (Section IV-D).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "KL-dataset fine-tuning improves pass@1 and pass@5 by 12.3% and 8.7% on average.",
    394       "evidence": "Figure 3 ablation comparing 'vanilla' vs. 'vanilla+KL' settings across three base models on VerilogEval(v1)-Human (Section IV-D).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "HAVEN-CodeQwen achieves the highest overall pass@1 of 47.4% on symbolic modality tasks, substantially outperforming GPT-4 (22.7%) and DeepSeek-Coder-V2 (34.1%).",
    399       "evidence": "Table V shows performance on 44 curated symbolic modality tasks from VerilogEval(v1)-Human with breakdown by truth table, waveform, and state diagram (Section IV-C).",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "SI-CoT helps commercial LLMs improve on symbolic modality tasks without fine-tuning.",
    404       "evidence": "Table VI shows GPT-4o mini improves from 22.7% to 31.8%, GPT-4 from 22.7% to 34.1%, and DeepSeek-Coder-V2 from 34.1% to 45.5% with SI-CoT (Section IV-D).",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "red_flags": [
    409     {
    410       "flag": "Best-temperature cherry-picking",
    411       "detail": "The paper tries 3 temperature settings (0.2, 0.5, 0.8) and reports 'the best performance' without disclosing which temperature was selected per model/benchmark or using a validation set for selection. This inflates reported numbers relative to what a practitioner would achieve."
    412     },
    413     {
    414       "flag": "No statistical significance testing",
    415       "detail": "All 'outperforms' and 'significantly improves' claims rest on comparing point estimates without significance tests. Several claimed improvements are small (e.g., 0.5% on RTLLM functional pass@5) and could easily be within random variation."
    416     },
    417     {
    418       "flag": "No limitations section",
    419       "detail": "The paper has no limitations, threats to validity, or scope boundary discussion despite making broad claims about alignment with HDL engineer practices based solely on automated benchmarks."
    420     },
    421     {
    422       "flag": "Non-reproducible baselines",
    423       "detail": "Several baselines (ChipNeMo, BetterV, AutoVCoder) are noted as 'not publicly accessible at the time of writing,' making independent verification of comparative claims impossible."
    424     },
    425     {
    426       "flag": "Benchmark contamination risk unaddressed",
    427       "detail": "VerilogEval was published in 2023 and the fine-tuning data comes from GitHub repositories. No analysis of whether benchmark problems or solutions appear in the training data."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "CodeHalu: Investigating Code Hallucinations in LLMs via Execution-based Verification",
    433       "authors": ["Y. Tian", "W. Yan", "Q. Yang"],
    434       "year": 2024,
    435       "arxiv_id": "2405.00253",
    436       "relevance": "Proposes a hallucination taxonomy for LLM code generation, directly informing HAVEN's hallucination classification approach."
    437     },
    438     {
    439       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    440       "authors": ["F. Liu", "Y. Liu", "L. Shi"],
    441       "year": 2024,
    442       "arxiv_id": "2404.00971",
    443       "relevance": "Analyzes hallucination phenomena in LLM-based code generation (HalluCode), providing context for hallucination mitigation work."
    444     },
    445     {
    446       "title": "RTLFixer: Automatically Fixing RTL Syntax Errors with Large Language Model",
    447       "authors": ["Y. Tsai", "M. Liu", "H. Ren"],
    448       "year": 2024,
    449       "doi": "10.1145/3649329.3657353",
    450       "relevance": "Addresses syntax error fixing in LLM-generated RTL code, a related approach to improving hardware code generation quality."
    451     },
    452     {
    453       "title": "OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and Self-Reflection",
    454       "authors": ["F. Cui", "C. Yin", "K. Zhou"],
    455       "year": 2024,
    456       "relevance": "State-of-the-art LLM-based Verilog code generation method using code augmentation, serving as HAVEN's primary baseline."
    457     },
    458     {
    459       "title": "RTLCoder: Outperforming GPT-3.5 in Design RTL Generation with Our Open-Source Dataset and Lightweight Solution",
    460       "authors": ["S. Liu", "W. Fang", "Y. Lu"],
    461       "year": 2024,
    462       "relevance": "Open-source RTL generation framework and dataset; serves as both a baseline and a methodological precursor for HAVEN's data augmentation approach."
    463     },
    464     {
    465       "title": "AutoVCoder: A Systematic Framework for Automated Verilog Code Generation Using LLMs",
    466       "authors": ["M. Gao", "J. Zhao", "Z. Lin"],
    467       "year": 2024,
    468       "relevance": "Systematic Verilog code generation framework fine-tuned on ~1M samples, key baseline for HAVEN's data efficiency claims."
    469     },
    470     {
    471       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    472       "authors": ["M. Liu", "N. Pinckney", "B. Khailany", "H. Ren"],
    473       "year": 2023,
    474       "relevance": "Primary benchmark used for evaluating LLM-based Verilog generation capability, including machine and human task splits."
    475     },
    476     {
    477       "title": "Revisiting VerilogEval: Newer LLMs, In-Context Learning, and Specification-to-RTL Tasks",
    478       "authors": ["N. Pinckney", "C. Batten", "M. Liu"],
    479       "year": 2024,
    480       "arxiv_id": "2408.11053",
    481       "relevance": "VerilogEval v2 benchmark extending evaluation to specification-to-RTL design tasks more aligned with HDL engineer practices."
    482     },
    483     {
    484       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    485       "authors": ["Y. Lu", "S. Liu", "Q. Zhang", "Z. Xie"],
    486       "year": 2024,
    487       "doi": "10.1109/ASP-DAC58780.2024.10473904",
    488       "relevance": "RTL-focused benchmark evaluating design generation with LLMs, used as one of three evaluation benchmarks."
    489     },
    490     {
    491       "title": "Evaluating Large Language Models Trained on Code",
    492       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    493       "year": 2021,
    494       "arxiv_id": "2107.03374",
    495       "relevance": "Foundational paper on evaluating code generation LLMs, introducing the pass@k metric used throughout HAVEN's evaluation."
    496     },
    497     {
    498       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    499       "authors": ["D. Guo"],
    500       "year": 2024,
    501       "arxiv_id": "2401.14196",
    502       "relevance": "Code-specific LLM used as one of HAVEN's base models for fine-tuning."
    503     },
    504     {
    505       "title": "BetterV: Controlled Verilog Generation with Discriminative Guidance",
    506       "authors": ["Z. Pei", "H. Zhen", "M. Yuan"],
    507       "year": 2024,
    508       "relevance": "Controlled Verilog generation approach using discriminative guidance, baseline comparison for HAVEN."
    509     },
    510     {
    511       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    512       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    513       "year": 2024,
    514       "relevance": "Examines the rigor of LLM code generation evaluation, relevant to the survey's scope of code generation methodology."
    515     }
    516   ],
    517   "engagement_factors": {
    518     "practical_relevance": {
    519       "score": 2,
    520       "justification": "Open-source framework with code released for Verilog code generation — useful for HDL engineers and chip design teams, though the domain is niche."
    521     },
    522     "surprise_contrarian": {
    523       "score": 1,
    524       "justification": "The hallucination taxonomy for Verilog is a novel framing, but the core approach (CoT + data augmentation + fine-tuning) follows established patterns."
    525     },
    526     "fear_safety": {
    527       "score": 0,
    528       "justification": "No AI risk or safety concerns raised; focused on improving code generation correctness."
    529     },
    530     "drama_conflict": {
    531       "score": 0,
    532       "justification": "No controversy; standard academic contribution in the LLM-for-hardware space."
    533     },
    534     "demo_ability": {
    535       "score": 2,
    536       "justification": "GitHub repository is provided and models are open-source at 7B parameters, potentially runnable on consumer hardware."
    537     },
    538     "brand_recognition": {
    539       "score": 0,
    540       "justification": "From Shanghai Jiao Tong University and Zhejiang University — respected institutions but not high-profile AI labs in the public consciousness."
    541     }
    542   }
    543 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs