scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31627B)
      1 {
      2   "paper": {
      3     "title": "DeepCircuitX: A Comprehensive Repository-Level Dataset for RTL Code Understanding, Generation, and PPA Analysis",
      4     "authors": [
      5       "Zeju Li",
      6       "Changran Xu",
      7       "Zhengyuan Shi",
      8       "Zedong Peng",
      9       "Yi Liu",
     10       "Yunhao Zhou",
     11       "Lingfeng Zhou",
     12       "Chengyu Ma",
     13       "Jianyuan Zhong",
     14       "Xi Wang",
     15       "Jieru Zhao",
     16       "Zhufei Chu",
     17       "Xiaoyan Yang",
     18       "Qiang Xu"
     19     ],
     20     "year": 2025,
     21     "venue": "2025 IEEE International Conference on LLM-Aided Design (ICLAD)",
     22     "arxiv_id": "2502.18297",
     23     "doi": "10.1109/ICLAD65226.2025.00029"
     24   },
     25   "scan_version": 3,
     26   "active_modules": [
     27     "experimental_rigor",
     28     "data_leakage"
     29   ],
     30   "methodology_tags": [
     31     "benchmark-eval"
     32   ],
     33   "key_findings": "DeepCircuitX provides a multi-level (repo/file/module/block) RTL dataset with 4,000+ projects and Chain-of-Thought annotations generated by GPT-4 and Claude. Fine-tuning LLMs on this dataset substantially improves RTL code understanding (BLEU-4 from ~0.1 to ~13.7), code generation (Pass@1 up to 24.14% on RTLLM), and PPA prediction (area MAPE down to 0.33). Delay prediction remains challenging with MAPE of 3.5-4.7 even at full training data, indicating early-stage timing estimation is an open problem.",
     34   "checklist": {
     35     "artifacts": {
     36       "code_released": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper states 'Our data is available at https://zeju.gitbook.io/lcm-team' but this refers to the dataset. No source code repository (training scripts, annotation pipeline, evaluation code) is mentioned or linked."
     40       },
     41       "data_released": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The abstract states 'Our data is available at https://zeju.gitbook.io/lcm-team', providing a URL for the dataset."
     45       },
     46       "environment_specified": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. EDA tool versions are mentioned (Synopsys Design Compiler 2019.12, PrimeTime 2023.12) but no environment details for LLM training or evaluation."
     50       },
     51       "reproduction_instructions": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are provided."
     55       }
     56     },
     57     "statistical_methodology": {
     58       "confidence_intervals_or_error_bars": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Tables VI, VII, and VIII report only point estimates. No confidence intervals, error bars, or ± notation appear anywhere in the results."
     62       },
     63       "significance_tests": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper claims models 'significantly outperform' their original counterparts (Section IV-C) but provides no statistical significance tests (no p-values, t-tests, or any formal test)."
     67       },
     68       "effect_sizes_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper reports raw metric values for original and fine-tuned models in tables but never explicitly quantifies the magnitude of improvement (no percentage improvement, Cohen's d, or relative change metrics). The text uses vague language like 'significant performance improvements' and 'substantial improvements' without quantification."
     72       },
     73       "sample_size_justified": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No justification is given for sample sizes. The PPA experiment uses 146 training and 10 test designs with no explanation for why these numbers were chosen. The human evaluation uses only 5 evaluators with no power analysis."
     77       },
     78       "variance_reported": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No standard deviation, variance, or any spread measure is reported across experimental runs in any table. All results appear to be single-run numbers."
     82       }
     83     },
     84     "evaluation_design": {
     85       "baselines_included": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Tables VI and VII compare fine-tuned models against their original (non-fine-tuned) versions. Table VII also includes CodeV (QW-7B) as an additional baseline. Table VIII compares three PPA prediction models."
     89       },
     90       "baselines_contemporary": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The baseline models include DeepSeek-Coder-V2-lite (2024), CodeV (2024), and CodeGen2.5 (2023), which are reasonably contemporary for a 2025 paper. External benchmarks RTLLM (2024) and VerilogEval (2023) are also recent."
     94       },
     95       "ablation_study": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No ablation study is presented. The dataset has multiple components (CoT annotations, multi-level structure, different annotation levels, PPA data) but no experiment isolates which components contribute to performance gains."
     99       },
    100       "multiple_metrics": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper uses BLEU-4, METEOR, ROUGE-1, ROUGE-2, ROUGE-L for code understanding (Table VI); Pass@1 and Pass@5 for code completion/generation (Table VII); and MAPE and RRSE for PPA prediction (Table VIII)."
    104       },
    105       "human_evaluation": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section IV-B describes human evaluation by 5 independent experienced engineers who grade annotation quality on accuracy, completeness, and understandable clarity on a 1-4 scale (Table V)."
    109       },
    110       "held_out_test_set": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Code completion/generation experiments use external benchmarks RTLLM and VerilogEval as test sets. PPA prediction uses a separate 10-design test set distinct from the 146 training designs."
    114       },
    115       "per_category_breakdown": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table III shows data counts broken down by IP, Module, and Chip categories. Table VII shows results on two separate benchmarks (RTLLM, VerilogEval). Table VIII breaks down PPA prediction into Area, Power, and Delay, and shows results at 10%, 50%, and 100% training data."
    119       },
    120       "failure_cases_discussed": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section IV-E discusses that PPA predictors show 'weaker performance in delay prediction' with high MAPE values (4.7 and 3.5), noting 'estimating timing characteristics in the early stages...is still difficult.' They also note models 'exhibit diminished performance on designs more than 10k cells.'"
    124       },
    125       "negative_results_reported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper honestly reports poor delay prediction results (MAPE of 4.7-80.8 in Table VIII) and acknowledges that 'how to accurately predict PPA of practical designs remains an opening question.' The PPA models perform worse on their larger designs than on simpler benchmarks."
    129       }
    130     },
    131     "claims_and_evidence": {
    132       "abstract_claims_supported": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The abstract claims the dataset enables improved LLM performance on RTL tasks and PPA analysis. Tables VI, VII show fine-tuning improvements, Table V shows annotation quality, and Table VIII shows PPA prediction results, supporting these claims."
    136       },
    137       "causal_claims_justified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper claims 'Fine-tuning LLMs on our dataset leads to significant performance improvements' (causal language), but the study only compares original vs. fine-tuned on their data. No comparison against fine-tuning on alternative RTL datasets is provided, so the improvement could stem from any domain-specific fine-tuning, not specifically their dataset's qualities."
    141       },
    142       "generalization_bounded": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The title claims 'Comprehensive...RTL Code Understanding, Generation, and PPA Analysis' but experiments are exclusively on Verilog. VHDL and other HDLs are mentioned in the introduction but not tested. The conclusion claims to 'advance RTL-focused machine learning applications in hardware design automation' without bounding to Verilog specifically."
    146       },
    147       "alternative_explanations_discussed": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No alternative explanations are discussed for any of the results. For example, improvements from fine-tuning could be due to domain adaptation in general rather than the specific dataset design choices, but this is not considered."
    151       },
    152       "proxy_outcome_distinction": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper measures BLEU/ROUGE/METEOR and frames this as 'RTL code understanding,' but these metrics measure surface-level text similarity, not actual understanding. Pass@k measures functional correctness but is framed as 'code generation' capability more broadly. The gap between proxy metrics and claimed capabilities is not discussed."
    156       }
    157     },
    158     "setup_transparency": {
    159       "model_versions_specified": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Models are identified by name and size (e.g., 'CodeLlama (7b)', 'CodeT5+ (220m-bimodal)', 'DeepSeek-V2-lite (16b)') but no exact version numbers, snapshot dates, or model IDs are provided. GPT-4 and Claude are used for annotation with no version specified."
    163       },
    164       "prompts_provided": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The CoT annotation process using GPT-4 and Claude is described at a high level (Section III-B) with the general approach (structured questions about 'What' and 'How'), but the actual prompt text is never provided. Only natural language descriptions of the prompting strategy are given."
    168       },
    169       "hyperparameters_reported": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No hyperparameters are reported for LLM fine-tuning (learning rate, epochs, batch size, optimizer) or for annotation generation (temperature, sampling settings for GPT-4/Claude). The synthesis settings mention tool versions but no ML hyperparameters."
    173       },
    174       "scaffolding_described": {
    175         "applies": false,
    176         "answer": false,
    177         "justification": "No agentic scaffolding is used. The approach involves standard LLM fine-tuning and prompted annotation generation, not agentic workflows."
    178       },
    179       "data_preprocessing_documented": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section III-A describes the data collection process using 222 keywords across chip/IP/module levels from GitHub and other sources. Section III-B details the CoT annotation pipeline at module, block, and repo levels. Tables I-III provide quantitative breakdowns of the dataset at each stage."
    183       }
    184     },
    185     "limitations_and_scope": {
    186       "limitations_section_present": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No dedicated limitations, threats to validity, or similar section exists. The conclusion briefly mentions 'future work' but does not discuss limitations of the current work."
    190       },
    191       "threats_to_validity_specific": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of potential issues with the dataset, annotation quality, or experimental methodology."
    195       },
    196       "scope_boundaries_stated": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what populations or settings are excluded, or what claims are not being made. The framing is uniformly positive."
    200       }
    201     },
    202     "data_integrity": {
    203       "raw_data_available": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The dataset is available at the provided URL (https://zeju.gitbook.io/lcm-team), which should include the RTL code, annotations, and related data for independent verification."
    207       },
    208       "data_collection_described": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Section III-A describes collection from GitHub using 222 keywords across chip-level, IP-level, and module-level designs. Table I provides counts by category (17+3+57 function categories, 4795 repos, 185K+ files). The keyword-based collection strategy is documented."
    212       },
    213       "recruitment_methods_described": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "The human evaluation (Section IV-B) mentions '5 individuals' who are 'independent experienced engineers' but provides no details on how they were recruited, their background, or selection criteria."
    217       },
    218       "data_pipeline_documented": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The pipeline from keyword search to final dataset has undocumented steps. The paper says 'over 4,000' repos and '140,000 RTL files across 77 functional categories' in the text, but Table I shows 4,795 repos and 185,809 files. No filtering criteria or counts of rejected repos are provided. The jump from raw GitHub results to curated dataset is unexplained."
    222       }
    223     },
    224     "conflicts_of_interest": {
    225       "funding_disclosed": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding sources are disclosed in the paper. There is no acknowledgments section listing grants or sponsors, despite the involvement of the National Center of Technology Innovation for EDA."
    229       },
    230       "affiliations_disclosed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Author affiliations are clearly listed: CUHK, unnamed Shanghai university, Hangzhou Dianzi University, Ningbo University, Southeast University, and National Center of Technology Innovation for EDA."
    234       },
    235       "funder_independent_of_outcome": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No funding is disclosed, so independence cannot be assessed. Authors are affiliated with the National Center of Technology Innovation for EDA, which could have a stake in EDA research outcomes, but this potential conflict is not addressed."
    239       },
    240       "financial_interests_declared": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No competing interests or financial interests statement appears in the paper."
    244       }
    245     },
    246     "contamination": {
    247       "training_cutoff_stated": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "The paper does not state training data cutoff dates for any of the base models (CodeLlama, CodeT5+, CodeGen, DeepSeek). This is relevant because these models may have seen RTLLM and VerilogEval benchmark problems during pre-training."
    251       },
    252       "train_test_overlap_discussed": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No discussion of whether the DeepCircuitX training data overlaps with the RTLLM or VerilogEval evaluation benchmarks. Both draw from publicly available Verilog code sources, making overlap plausible."
    256       },
    257       "benchmark_contamination_addressed": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "RTLLM and VerilogEval are publicly available benchmarks that pre-date the models' training data collection. No contamination analysis is performed despite the risk that base models may have encountered these problems during pre-training."
    261       }
    262     },
    263     "human_studies": {
    264       "pre_registered": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No pre-registration is mentioned for the human evaluation study."
    268       },
    269       "irb_or_ethics_approval": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No IRB or ethics board approval is mentioned for the human evaluation involving 5 engineers."
    273       },
    274       "demographics_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "The evaluators are described only as 'independent experienced engineers' with no demographics reported (years of experience, specific expertise, affiliation, etc.)."
    278       },
    279       "inclusion_exclusion_criteria": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inclusion or exclusion criteria for the 5 human evaluators are stated."
    283       },
    284       "randomization_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "The human evaluation is a rating task, not a comparative experimental study with treatment conditions requiring randomization."
    288       },
    289       "blinding_described": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "The human evaluation is a rating task for annotation quality, not a comparative study where blinding to conditions would be relevant."
    293       },
    294       "attrition_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No information on whether any evaluators dropped out or failed to complete the evaluation task."
    298       }
    299     },
    300     "cost_and_practicality": {
    301       "inference_cost_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No inference costs, API costs, or latency measurements are reported for any experiment. The annotation generation using GPT-4 and Claude must have incurred significant costs, but these are not quantified."
    305       },
    306       "compute_budget_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No GPU hours, training time, total API spend, or hardware specifications are provided for LLM fine-tuning, annotation generation, or EDA synthesis runs."
    310       }
    311     },
    312     "experimental_rigor": {
    313       "seed_sensitivity_reported": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No mention of multiple random seeds. All results appear to be from single runs with no sensitivity analysis."
    317       },
    318       "number_of_runs_stated": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they come from single or multiple runs."
    322       },
    323       "hyperparameter_search_budget": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No hyperparameter search budget is reported. Fine-tuning hyperparameters appear to have been selected, but neither their values nor the search process is described."
    327       },
    328       "best_config_selection_justified": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No explanation of how model configurations or training settings were selected. Results are presented without justifying the choice of the reported configuration."
    332       },
    333       "multiple_comparison_correction": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No statistical tests are performed at all, making multiple comparison correction inapplicable."
    337       },
    338       "self_comparison_bias_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The authors evaluate their own dataset and annotation pipeline without acknowledging potential self-evaluation bias. No independent evaluation is conducted."
    342       },
    343       "compute_budget_vs_performance": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "Models range from 220M to 16B parameters with vastly different compute requirements, but no performance-vs-compute analysis is provided. Results are not compared at matched compute budgets."
    347       },
    348       "benchmark_construct_validity": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether RTLLM and VerilogEval actually measure what is claimed (RTL code generation quality), or whether BLEU/ROUGE metrics are valid measures of code understanding. Construct validity is not addressed."
    352       },
    353       "scaffold_confound_addressed": {
    354         "applies": false,
    355         "answer": false,
    356         "justification": "No scaffolding is involved in the evaluations. Models are fine-tuned and evaluated directly without agentic scaffolds."
    357       }
    358     },
    359     "data_leakage": {
    360       "temporal_leakage_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of temporal leakage. The base models may have been trained on data that includes solutions to RTLLM and VerilogEval problems, and the DeepCircuitX training data (from GitHub) may temporally overlap with evaluation benchmarks."
    364       },
    365       "feature_leakage_addressed": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No discussion of whether the evaluation setup leaks information, for example whether the CoT annotations or dataset organization could provide hints not available in real usage scenarios."
    369       },
    370       "non_independence_addressed": {
    371         "applies": true,
    372         "answer": false,
    373         "justification": "No discussion of whether the DeepCircuitX training data shares Verilog code from the same repositories or sources as RTLLM and VerilogEval benchmarks. Both draw from publicly available Verilog code, making structural overlap plausible."
    374       },
    375       "leakage_detection_method": {
    376         "applies": true,
    377         "answer": false,
    378         "justification": "No concrete leakage detection or prevention method is applied. No decontamination, n-gram overlap analysis, or temporal splitting is performed."
    379       }
    380     }
    381   },
    382   "claims": [
    383     {
    384       "claim": "Fine-tuning LLMs on DeepCircuitX leads to significant performance improvements across all metrics for RTL code understanding.",
    385       "evidence": "Table VI shows BLEU-4 improvements from 0.08-2.24 (original) to 0.86-13.70 (fine-tuned) across all models. CodeGen2.5 improves from 0.1060 to 13.6858 BLEU-4.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "The dataset is adaptable across model sizes from 220M to 16B parameters.",
    390       "evidence": "Table VI shows CodeT5+ 220M improves to 4.9067 BLEU-4, while 7B and 16B models also show gains. Table VII shows all model sizes improve on Pass@k metrics.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Fine-tuned CodeGen2.5 outperforms CodeV, a model previously fine-tuned for Verilog generation.",
    395       "evidence": "Table VII shows CodeGen2.5 (7b) achieves 24.14% Pass@1 on RTLLM vs CodeV's 14.80%, and 24.36% vs 4.5% on VerilogEval.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "PPA prediction accuracy improves with increased training data volume.",
    400       "evidence": "Table VIII shows area MAPE decreasing from 4.32 (10% data) to 0.33 (100% data) for MasterRTL, with similar trends for other models and metrics.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Accurate early-stage delay prediction remains an open challenge.",
    405       "evidence": "Table VIII shows delay MAPE of 4.74 (SNS) and 3.48 (MasterRTL) at 100% data, far worse than area (0.66, 0.33) and power (0.75, 0.65) predictions.",
    406       "supported": "strong"
    407     },
    408     {
    409       "claim": "Human evaluation confirms annotation quality of at least 3.5/4 across all metrics.",
    410       "evidence": "Table V shows accuracy 3.74/4 (repo) and 3.5/4 (module), completeness 3.79/4 and 3.78/4, clarity 3.84/4 and 3.76/4. Based on 5 evaluators.",
    411       "supported": "weak"
    412     }
    413   ],
    414   "red_flags": [
    415     {
    416       "flag": "No error bars or variance across runs",
    417       "detail": "All experimental results in Tables VI, VII, and VIII are single point estimates with no uncertainty quantification. The paper claims 'significant' improvements without any statistical significance testing."
    418     },
    419     {
    420       "flag": "No ablation study for dataset components",
    421       "detail": "The dataset has multiple design choices (CoT annotations, multi-level structure, different annotation levels) but no experiment isolates which components drive the observed improvements. The reader cannot determine whether the gains come from the annotations, the data volume, the multi-level structure, or simply domain-specific fine-tuning."
    422     },
    423     {
    424       "flag": "Potential train-test data leakage",
    425       "detail": "Both DeepCircuitX and the evaluation benchmarks (RTLLM, VerilogEval) draw from publicly available Verilog code sources. No analysis of overlap between training and test data is conducted."
    426     },
    427     {
    428       "flag": "Tiny human evaluation sample",
    429       "detail": "Only 5 evaluators assessed annotation quality with no reported demographics, recruitment criteria, or inter-rater reliability measures. The number of annotation samples evaluated is not specified."
    430     },
    431     {
    432       "flag": "No limitations section",
    433       "detail": "The paper has no limitations, threats to validity, or scope boundaries discussion. All framing is uniformly positive despite significant methodological gaps."
    434     },
    435     {
    436       "flag": "Inconsistent dataset statistics",
    437       "detail": "The paper text states 'over 4,000 repository-level RTL projects' and '140,000 RTL files across 77 functional categories,' but Table I sums to 4,795 repos and 185,809 files. These discrepancies are not explained."
    438     },
    439     {
    440       "flag": "Missing training details",
    441       "detail": "No hyperparameters, training configuration, hardware details, or compute budgets are reported for any of the fine-tuning experiments, making reproduction impossible."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "RTL-Repo: A Benchmark for Evaluating LLMs on Large-Scale RTL Design Projects",
    447       "authors": [
    448         "A. Allam",
    449         "M. Shalan"
    450       ],
    451       "year": 2024,
    452       "arxiv_id": "2405.17378",
    453       "relevance": "Benchmark for evaluating LLMs on RTL design projects, directly comparable to DeepCircuitX's goals."
    454     },
    455     {
    456       "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation",
    457       "authors": [
    458         "S. Thakur",
    459         "B. Ahmad",
    460         "Z. Fan",
    461         "H. Pearce",
    462         "B. Tan",
    463         "R. Karri",
    464         "B. Dolan-Gavitt",
    465         "S. Garg"
    466       ],
    467       "year": 2023,
    468       "relevance": "Benchmark evaluation of LLMs for Verilog code generation, one of the foundational papers in LLM-for-hardware."
    469     },
    470     {
    471       "title": "Data is All You Need: Finetuning LLMs for Chip Design via an Automated Design-Data Augmentation Framework",
    472       "authors": [
    473         "K. Chang",
    474         "K. Wang",
    475         "N. Yang"
    476       ],
    477       "year": 2024,
    478       "arxiv_id": "2403.11202",
    479       "relevance": "Data augmentation framework for chip design LLMs, addressing the same data quality challenge as DeepCircuitX."
    480     },
    481     {
    482       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    483       "authors": [
    484         "Y. Lu",
    485         "S. Liu",
    486         "Q. Zhang",
    487         "Z. Xie"
    488       ],
    489       "year": 2024,
    490       "relevance": "One of the two evaluation benchmarks used in this paper for RTL code generation with LLMs."
    491     },
    492     {
    493       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    494       "authors": [
    495         "M. Liu",
    496         "N. Pinckney",
    497         "B. Khailany",
    498         "H. Ren"
    499       ],
    500       "year": 2023,
    501       "relevance": "The other benchmark used in this paper to evaluate Verilog code generation."
    502     },
    503     {
    504       "title": "Code Llama: Open Foundation Models for Code",
    505       "authors": [
    506         "B. Roziere",
    507         "J. Gehring"
    508       ],
    509       "year": 2023,
    510       "arxiv_id": "2308.12950",
    511       "relevance": "Open-source code LLM used as one of the base models fine-tuned on DeepCircuitX."
    512     },
    513     {
    514       "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation",
    515       "authors": [
    516         "Y. Wang",
    517         "H. Le",
    518         "A. D. Gotmare"
    519       ],
    520       "year": 2023,
    521       "arxiv_id": "2305.07922",
    522       "relevance": "Code understanding and generation LLM used as a base model in the fine-tuning experiments."
    523     },
    524     {
    525       "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence",
    526       "authors": [
    527         "Q. Zhu",
    528         "D. Guo",
    529         "Z. Shao"
    530       ],
    531       "year": 2024,
    532       "arxiv_id": "2406.11931",
    533       "relevance": "Code intelligence LLM that showed strong baseline performance on RTL tasks even before fine-tuning."
    534     },
    535     {
    536       "title": "CodeV: Empowering LLMs for Verilog Generation through Multi-Level Summarization",
    537       "authors": [
    538         "Y. Zhao",
    539         "D. Huang",
    540         "C. Li"
    541       ],
    542       "year": 2024,
    543       "arxiv_id": "2407.10424",
    544       "relevance": "LLM specifically fine-tuned for Verilog generation, used as a comparison baseline."
    545     },
    546     {
    547       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    548       "authors": [
    549         "S. Thakur",
    550         "B. Ahmad",
    551         "H. Pearce"
    552       ],
    553       "year": 2024,
    554       "relevance": "LLM fine-tuned on Verilog datasets, representing the prior state of LLM-based hardware code generation."
    555     },
    556     {
    557       "title": "MG-Verilog: Multi-Grained Dataset Towards Enhanced LLM-Assisted Verilog Generation",
    558       "authors": [
    559         "Y. Zhang",
    560         "Z. Yu",
    561         "Y. Fu"
    562       ],
    563       "year": 2024,
    564       "arxiv_id": "2407.01910",
    565       "relevance": "Multi-grained Verilog dataset for LLM training, a direct competitor/comparable dataset to DeepCircuitX."
    566     },
    567     {
    568       "title": "OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and Self-Reflection",
    569       "authors": [
    570         "F. Cui",
    571         "C. Yin",
    572         "K. Zhou"
    573       ],
    574       "year": 2024,
    575       "arxiv_id": "2407.16237",
    576       "relevance": "RTL code generation approach using augmentation and self-reflection, relevant to LLM-based hardware design."
    577     }
    578   ],
    579   "engagement_factors": {
    580     "practical_relevance": {
    581       "score": 1,
    582       "justification": "Useful for the small niche of hardware designers using LLMs for RTL code, but irrelevant to most software practitioners."
    583     },
    584     "surprise_contrarian": {
    585       "score": 0,
    586       "justification": "Results confirm the expected pattern that fine-tuning on domain-specific data improves performance, with no surprising findings."
    587     },
    588     "fear_safety": {
    589       "score": 0,
    590       "justification": "No safety, security, or risk implications discussed."
    591     },
    592     "drama_conflict": {
    593       "score": 0,
    594       "justification": "No controversy or challenge to existing claims; straightforward dataset contribution paper."
    595     },
    596     "demo_ability": {
    597       "score": 1,
    598       "justification": "Dataset is available via a Gitbook page but requires significant setup for fine-tuning and synthesis tool access."
    599     },
    600     "brand_recognition": {
    601       "score": 0,
    602       "justification": "From the Chinese University of Hong Kong and partner institutions, not widely recognized labs in the broader tech community."
    603     }
    604   }
    605 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs