scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28498B)
      1 {
      2   "paper": {
      3     "title": "OpenLLM-RTL: Open Dataset and Benchmark for LLM-Aided Design RTL Generation",
      4     "authors": [
      5       "Shang Liu",
      6       "Yao Lu",
      7       "Wenji Fang",
      8       "Mengming Li",
      9       "Zhiyao Xie"
     10     ],
     11     "year": 2024,
     12     "venue": "ICCAD '24 (ACM/IEEE International Conference on Computer-Aided Design)",
     13     "arxiv_id": "2503.15112",
     14     "doi": "10.1145/3676536.3697118"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "OpenLLM-RTL integrates three open-source contributions: RTLLM 2.0 (50 hand-crafted RTL designs for benchmarking), AssertEval (18 designs for assertion-based verification evaluation), and RTLCoder-Data (80K raw + 7K verified instruction-code samples). A DeepSeek-Coder-6.7B model fine-tuned on the 80K dataset outperforms GPT-3.5 and matches GPT-4 on VerilogEval. Critically, 7K verified samples outperform 27K-50K raw samples on most metrics, demonstrating that assertion-based data quality verification substantially improves training efficiency.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Three GitHub repositories are provided: RTLLM 2.0 at github.com/hkust-zhiyao/RTLLM, AssertEval at github.com/hkust-zhiyao/AssertLLM, and RTLCoder-Data at github.com/hkust-zhiyao/RTL-Coder (footnotes in Sections 2, 3, and 4)."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All three benchmarks and datasets are open-sourced: RTLLM 2.0 (50 designs with descriptions, testbenches, and reference RTL), AssertEval (18 designs with specifications and golden RTL), and RTLCoder-Data (80K raw + 7K verified samples)."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions hardware (4x RTX 4090 24GB) and DeepSpeed stage-2, but provides no requirements.txt, Dockerfile, or detailed dependency listing with library versions. Software environment is not specified beyond framework names."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level (Section 4.1, 5.1) but there are no runnable commands, scripts, or README-level instructions for reproducing the experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Table 5 are point estimates (e.g., '64.7%' pass@1) with no confidence intervals, error bars, or uncertainty measures."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Claims like 'outperforms GPT4 by an absolute value of 4.7%' (Section 5.3) are made by comparing point estimates with no statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Absolute and relative differences are reported with baseline context, e.g., 'performance on the Eval-Machine pass@1 metric rises from 53.7% to 64.7%' as data increases from 5K to 80K (Section 5.3). Table 5 provides both system and baseline scores."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for benchmark sizes (50 designs in RTLLM 2.0, 18 in AssertEval). The expansion from 30 to 50 designs is motivated by 'more thorough evaluation' but no quantitative rationale is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance or standard deviation is reported across experimental runs. Three temperature conditions are tested and only the best is reported: 'we evaluate all 3 temperature conditions {0.2, 0.5, 0.8} and report the best performance for each model' (Section 5.1)."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Table 5 includes extensive baselines: commercial (GPT-3.5, GPT-4), closed-source (ChipNeMo, VerilogEval, BetterV), and open-source (Codegen2, Starcoder, Thakur et al., Mistral-7B, DeepSeek-Coder)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include recent models: BetterV (2024), DeepSeek-Coder (2024), and GPT-4 (2023). These are contemporary and competitive at the time of publication."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 5.3 systematically studies impact of training data amount (5K/27K/50K/80K), training scheme (direct vs scoring-based), and data quality (raw vs verified 7K). These are ablation-style studies isolating individual factors."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are used: pass@1, pass@5, pass@10 on VerilogEval (Eval-Machine and Eval-Human), and Syntax-VCS and Func on RTLLM V1.1 (Table 5)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated: synthesis for syntax correctness, testbench simulation for functionality, and PPA measurement for design quality. No human evaluation of generated RTL quality is performed."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "VerilogEval and RTLLM are used as independent test benchmarks. Training data overlap is checked via Rouge-L similarity (Figure 4a), and samples with Rouge-L > 0.5 are removed from the training data (Section 5.2)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "RTLLM 2.0 categorizes designs into Arithmetic, Memory, Control, and Miscellaneous modules (Table 2). AssertEval categorizes by design type: cryptographic, processor, arithmetic, communication, memory (Table 3). However, results in Table 5 are aggregated, not broken down by category."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No failure cases or error analysis is presented. The paper does not discuss where models fail, what types of designs are hardest, or show example generation failures."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "Every experiment shows improvement. No failed approaches, unsuccessful configurations, or negative results are reported."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims about benchmark sizes (50 designs, 18 designs, 80K/7K samples), the verification-based data quality method, and LLM performance improvement are all supported by results in Sections 2-5 and Table 5."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims like 'enlarging the training dataset... significantly boost the model's code generation capabilities' are supported by controlled experiments varying data amount (5K→80K) while holding other factors constant (same base model, same training procedure). The study design with single-variable manipulation is adequate."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims general 'LLM-Aided Design RTL Generation' but all experiments are on Verilog only. The paper mentions VHDL and Chisel support (Section 2.1) but provides no non-Verilog results. Generalization to other HDLs is not bounded."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations are discussed for the observed results. For example, the 7K verified dataset's superiority could partly reflect bias toward simpler designs (verification is easier for simple circuits), but this is not explored."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 2.3.2 explicitly distinguishes three progressive goals: syntax, functionality, and quality (PPA). The paper acknowledges that 'While the testbench samples a reasonable number of cases, passing them doesn't guarantee 100% functionality correctness,' recognizing the proxy gap between benchmark pass rates and actual design correctness."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Fine-tuned models specify versions (Mistral-7B-v0.1, DeepSeek-Coder-6.7b-Instruct), but commercial baselines are listed only as 'GPT-3.5' and 'GPT4' without API versions or snapshot dates (Table 5). Model behavior varies across versions."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The data generation flow (Figure 3) describes prompt types at a high level (keyword-based, code-based, mutation) but actual prompt text is not provided. Evaluation prompts (design descriptions) are provided as part of the benchmark, but the training data generation prompts are described only conceptually."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.1 reports: Adam optimizer with β1=0.9, β2=0.999, lr=1e-5, no weight decay, context length 2048, global batch size 256, DeepSpeed stage-2. Inference temperatures {0.2, 0.5, 0.8} tested."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. Models are directly prompted or fine-tuned for single-turn generation."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 4.1 describes the 3-stage data generation pipeline in detail (Figure 3): keyword preparation, instruction generation with mutation and checking, reference code generation. Section 5.2 documents Rouge-L filtering (>0.5 removed). Diversity analysis is provided (Table 4)."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Limitation and Challenges' provides substantive discussion of challenges in both RTL generation benchmarking and assertion generation, spanning design complexity, description detail, data leakage, and assertion quality."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 discusses specific threats: (1) overly complex designs fail to differentiate models, (2) description detail level affects whether the task is generation or translation, (3) Rouge-L 'may not be perfect' for leakage detection, (4) pre-training leakage is uncontrollable, (5) assertion quality depends on specification richness."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to Verilog-only, does not state what design complexity levels are excluded, and does not specify what types of RTL generation tasks are outside scope."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "All datasets and benchmarks are open-sourced via GitHub (RTLLM 2.0, AssertEval, RTLCoder-Data with both 80K raw and 7K verified samples). Raw training data is available for verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4.1 describes the 3-stage automated data generation flow in detail: keyword preparation from GPT, instruction generation with mutation, and reference code generation. The process is clearly documented with Figure 3."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data is synthetically generated via LLM prompting and collected from public Verilog sources. Benchmarks are hand-crafted by the authors."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Figure 3 illustrates the full pipeline from keywords to final datasets. Section 4.2 explains the 80K raw dataset generation (relaxed diversity checking, enlarged source pool). Section 4.3 explains the 7K verified dataset (syntax + functionality checking). Token distributions shown in Figure 4b."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source or acknowledgments section is present in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are listed as affiliated with Hong Kong University of Science and Technology (HKUST), clearly stated in the paper header."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence cannot be assessed. The paper evaluates the authors' own benchmarks and datasets (RTLLM, RTLCoder), creating a potential self-evaluation conflict."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial disclosure statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any model. GPT-3.5, GPT-4, Mistral, DeepSeek-Coder, and other baselines are used without specifying when their pre-training data was collected."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Section 5.2 computes Rouge-L similarity between training data and test benchmarks (Figure 4a), finding most samples have low overlap (~0.25). Samples with Rouge-L > 0.5 are removed. However, this only covers fine-tuning data, not pre-training overlap."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Section 6 acknowledges that 'leakage during the LLM pre-training process is difficult to control' and calls it 'a challenging open problem,' but no concrete measures are taken to address whether VerilogEval or RTLLM benchmarks appeared in pre-training data of commercial models."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference costs, API costs, or latency metrics are reported for any evaluated model."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Hardware is mentioned (4x RTX 4090 GPUs) but total training time, GPU hours, or computational budget are not quantified. Only relative comparisons are given: '7K verified uses <20% of the training time of 50K' (Section 5.3)."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single training runs."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of training runs is not stated. For RTLLM evaluation, 5 trials (pass@5) are used, but it is unclear how many independent training runs produced each model."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Three temperature settings {0.2, 0.5, 0.8} are tested at inference time with best reported, but no hyperparameter search budget for training is described. The choice of learning rate, batch size, etc. appears fixed without justification."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Section 5.1 states 'we evaluate all 3 temperature conditions {0.2, 0.5, 0.8} and report the best performance for each model.' The best temperature per model is selected without justification and without showing the results for all configurations."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate models on their own benchmarks (RTLLM, RTLLM 2.0) and compare against their own prior work (RTLCoder). This self-evaluation bias is not acknowledged or discussed."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Models of different sizes (6.7B-16B) and different training compute are compared without matching or reporting compute budgets. The relative training efficiency of 7K verified vs 50K raw is noted qualitatively but not quantified."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Section 2.3.2 defines syntax, functionality, and quality goals, but does not discuss whether pass@k on 50 hand-crafted designs measures actual RTL generation capability. Section 6 raises questions about design complexity and description detail without resolving them."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. Models are evaluated via direct generation."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of temporal aspects. VerilogEval was published in 2023; models trained after 2023 may have seen its solutions. This is not addressed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information. The design descriptions include module names and I/O signal specifications, which could match training data patterns."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": true,
    359         "justification": "Section 5.2 computes Rouge-L similarity between training samples and benchmark test cases (Figure 4a), finding most have low similarity (~0.25). Samples with Rouge-L > 0.5 are filtered out to reduce overlap."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Rouge-L similarity calculation is used as a concrete leakage detection method (Section 5.2, Figure 4a). Training samples with Rouge-L > 0.5 against benchmark test cases are removed."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "DeepSeek-Direct (80K) outperforms GPT-3.5 and all non-commercial baselines on all metrics, and outperforms GPT-4 on VerilogEval Eval-Machine pass@1 by 4.7%",
    371       "evidence": "Table 5 shows DeepSeek-Direct (80K) achieves 64.7% pass@1 on Eval-Machine vs GPT-4's 60.0%. It exceeds GPT-3.5 (46.7%) and all open-source baselines across metrics.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Larger training datasets consistently improve model performance with no saturation at 80K samples",
    376       "evidence": "Table 5 and Figure 5 show monotonic improvement from 5K (53.7%) to 27K (59.8%) to 50K (62.6%) to 80K (64.7%) on Eval-Machine pass@1. Section 5.3 states 'even with 80K data samples, there are still no signs of model performance saturation.'",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "7K verified dataset outperforms 27K raw dataset on all metrics and 50K raw dataset on 6/8 metrics",
    381       "evidence": "Table 5 shows DeepSeek-Direct (7K verified) at 61.3% pass@1 vs DeepSeek-Direct (27K) at 59.8% and DeepSeek-Direct (50K) at 62.6% on Eval-Machine. On Eval-Human pass@5/10 and RTLLM Syntax, 7K verified exceeds 50K raw.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Scoring-based training scheme improves over basic direct training",
    386       "evidence": "Table 5 shows Mistral-Scoring and DeepSeek-Scoring outperform their Direct counterparts on all metrics with the same 27K data. E.g., DeepSeek-Scoring 61.2% vs DeepSeek-Direct 59.8% on Eval-Machine pass@1.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "RTLCoder-Data exhibits satisfactory diversity compared with other open-source Verilog datasets",
    391       "evidence": "Table 4 shows lower CR (4.21) and CR:POS (7.33) for RTLCoder-Data Raw compared to MG-Verilog (5.80/9.16) and Goh et al. (5.27/10.1), where lower values indicate higher diversity.",
    392       "supported": "strong"
    393     }
    394   ],
    395   "red_flags": [
    396     {
    397       "flag": "Best-of-temperatures reporting inflates results",
    398       "detail": "Section 5.1 states 'we evaluate all 3 temperature conditions {0.2, 0.5, 0.8} and report the best performance for each model.' This cherry-picks the best configuration without showing variance or all results, inflating reported performance."
    399     },
    400     {
    401       "flag": "No error bars or variance on any result",
    402       "detail": "All results in Table 5 are single point estimates with no confidence intervals, standard deviations, or variance across runs. Given that LLM generation is stochastic, single-run results are unreliable."
    403     },
    404     {
    405       "flag": "Self-evaluation bias: authors benchmark on own dataset",
    406       "detail": "The authors evaluate their fine-tuned models on RTLLM, which they also authored (reference [29]). The benchmark design may favor their training data distribution. This conflict is not acknowledged."
    407     },
    408     {
    409       "flag": "Pre-training contamination unaddressed for baselines",
    410       "detail": "Commercial models (GPT-3.5, GPT-4) and open-source models may have seen VerilogEval or RTLLM benchmark problems during pre-training. Only fine-tuning overlap is checked via Rouge-L, not pre-training contamination."
    411     }
    412   ],
    413   "cited_papers": [
    414     {
    415       "title": "RTLCoder: Outperforming GPT-3.5 in Design RTL Generation with Our Open-Source Dataset and Lightweight Solution",
    416       "authors": ["Shang Liu", "Wenji Fang", "Yao Lu", "Qijun Zhang", "Hongce Zhang", "Zhiyao Xie"],
    417       "year": 2023,
    418       "arxiv_id": "2312.08617",
    419       "relevance": "Prior work proposing the RTLCoder dataset and scoring-based training scheme for LLM-based RTL code generation."
    420     },
    421     {
    422       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    423       "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"],
    424       "year": 2023,
    425       "arxiv_id": "2309.07544",
    426       "relevance": "Key benchmark for evaluating LLM Verilog generation capability, used as primary evaluation in this paper."
    427     },
    428     {
    429       "title": "ChipNeMo: Domain-Adapted LLMs for Chip Design",
    430       "authors": ["Mingjie Liu", "Teodor-Dumitru Ene", "Robert Kirby"],
    431       "year": 2023,
    432       "arxiv_id": "2311.00176",
    433       "relevance": "Domain-adapted LLM for chip design using closed-source training data, evaluated as baseline."
    434     },
    435     {
    436       "title": "BetterV: Controlled Verilog Generation with Discriminative Guidance",
    437       "authors": ["Zehua Pei", "Hui-Ling Zhen", "Mingxuan Yuan", "Yu Huang", "Bei Yu"],
    438       "year": 2024,
    439       "arxiv_id": "2402.03375",
    440       "relevance": "Contemporary LLM-based Verilog generation approach achieving strong results on VerilogEval."
    441     },
    442     {
    443       "title": "MG-Verilog: Multi-grained Dataset Towards Enhanced LLM-assisted Verilog Generation",
    444       "authors": ["Yongan Zhang", "Zhongzhi Yu"],
    445       "year": 2024,
    446       "arxiv_id": "2407.01910",
    447       "relevance": "Open-source multi-grained Verilog dataset for LLM training, compared for diversity metrics."
    448     },
    449     {
    450       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    451       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    452       "year": 2024,
    453       "arxiv_id": "2401.14196",
    454       "relevance": "Base model used for fine-tuning in RTLCoder experiments, representative of code-specialized LLMs."
    455     },
    456     {
    457       "title": "AssertLLM: Generating and Evaluating Hardware Verification Assertions from Design Specifications via Multi-LLMs",
    458       "authors": ["Wenji Fang", "Mengming Li", "Min Li"],
    459       "year": 2024,
    460       "arxiv_id": "2402.00386",
    461       "relevance": "LLM-based assertion generation method whose techniques are used for the verification-based data quality checking in RTLCoder-Data."
    462     },
    463     {
    464       "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation",
    465       "authors": ["Shailja Thakur", "Baleegh Ahmad", "Zhenxing Fan", "Hammond Pearce"],
    466       "year": 2023,
    467       "relevance": "Early benchmark paper for LLM Verilog generation, provides code-only open-source dataset used in data generation."
    468     },
    469     {
    470       "title": "CodeV: Empowering LLMs for Verilog Generation through Multi-Level Summarization",
    471       "authors": ["Yang Zhao", "Di Huang"],
    472       "year": 2024,
    473       "arxiv_id": "2407.10424",
    474       "relevance": "LLM approach for Verilog generation using multi-level code summarization with closed-source data."
    475     },
    476     {
    477       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    478       "authors": ["Yao Lu", "Shang Liu", "Qijun Zhang", "Zhiyao Xie"],
    479       "year": 2023,
    480       "arxiv_id": "2308.05345",
    481       "relevance": "Original RTLLM benchmark (30 designs) that this paper extends to RTLLM 2.0 (50 designs)."
    482     },
    483     {
    484       "title": "OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and Self-Reflection",
    485       "authors": ["Fan Cui", "Chenyang Yin"],
    486       "year": 2024,
    487       "arxiv_id": "2407.16237",
    488       "relevance": "RTL code generation approach using data augmentation and self-reflection techniques."
    489     },
    490     {
    491       "title": "GPT-4 Technical Report",
    492       "authors": ["OpenAI"],
    493       "year": 2023,
    494       "arxiv_id": "2303.08774",
    495       "relevance": "Foundation model used as commercial baseline for RTL generation evaluation."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs