scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (17858B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DeepCircuitX: A Comprehensive Repository-Level Dataset for RTL Code Understanding, Generation, and PPA Analysis",
      6     "authors": [
      7       "Zeju Li",
      8       "Changran Xu",
      9       "Zhengyuan Shi",
     10       "Zedong Peng",
     11       "Yi Liu"
     12     ],
     13     "year": 2025,
     14     "venue": "2025 IEEE International Conference on LLM-Aided Design (ICLAD)",
     15     "arxiv_id": "2502.18297",
     16     "doi": "10.1109/ICLAD65226.2025.00029"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims the dataset enables improved LLM performance on RTL tasks and PPA analysis. Tables VI, VII show fine-tuning improvements, Table V shows annotation quality, and Table VIII shows PPA prediction results, supporting these claims.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims 'Fine-tuning LLMs on our dataset leads to significant performance improvements' (causal language), but the study only compares original vs. fine-tuned on their data. No comparison against fine-tuning on alternative RTL datasets is provided, so the improvement could stem from any domain-specific fine-tuning, not specifically their dataset's qualities.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims 'Comprehensive...RTL Code Understanding, Generation, and PPA Analysis' but experiments are exclusively on Verilog. VHDL and other HDLs are mentioned in the introduction but not tested. The conclusion claims to 'advance RTL-focused machine learning applications in hardware design automation' without bounding to Verilog specifically.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are discussed for any of the results. For example, improvements from fine-tuning could be due to domain adaptation in general rather than the specific dataset design choices, but this is not considered.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures BLEU/ROUGE/METEOR and frames this as 'RTL code understanding,' but these metrics measure surface-level text similarity, not actual understanding. Pass@k measures functional correctness but is framed as 'code generation' capability more broadly. The gap between proxy metrics and claimed capabilities is not discussed.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations, threats to validity, or similar section exists. The conclusion briefly mentions 'future work' but does not discuss limitations of the current work.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of potential issues with the dataset, annotation quality, or experimental methodology.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what populations or settings are excluded, or what claims are not being made. The framing is uniformly positive.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding sources are disclosed in the paper. There is no acknowledgments section listing grants or sponsors, despite the involvement of the National Center of Technology Innovation for EDA.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: CUHK, unnamed Shanghai university, Hangzhou Dianzi University, Ningbo University, Southeast University, and National Center of Technology Innovation for EDA.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed. Authors are affiliated with the National Center of Technology Innovation for EDA, which could have a stake in EDA research outcomes, but this potential conflict is not addressed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "RTL, PPA (Power-Performance-Area), CoT, and hierarchy levels (chip/IP/module/block) are all explicitly defined in context; the paper is precise about the EDA domain vocabulary.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit bullet-point contributions are listed: 4,000+ repo-level dataset, four-level organization, CoT annotation method, and pre-training/evaluation benchmarks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II systematically reviews prior EDA datasets (ISCAS, CircuitNet, RTLLM, VerilogEval) and explicitly identifies gaps (file-level only, no PPA, no cross-stage data) that DeepCircuitX addresses.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper asserts that BLEU/METEOR/ROUGE measure RTL code understanding and Pass@k measures generation capability, but never argues why these metrics are valid proxies for the claimed constructs in the hardware domain.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No difficulty tiers (easy/medium/hard) are defined or measured; the dataset distributes across chip/IP/module levels but this is a complexity hierarchy, not a difficulty characterization with empirical validation.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Several baseline models achieve 0% Pass@1 on generation tasks (CodeLlama, CodeT5+, CodeGen2 original), indicating potential floor effects, but the paper does not discuss or analyze this as a benchmark design concern.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Human evaluation covers only annotation quality (rating comments 1–4), not task performance; no human baseline is provided for code understanding, completion, or generation tasks.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "BLEU/METEOR/ROUGE are adopted without justification for RTL understanding; Pass@k is used without explaining the choice of k=1 and k=5 or why functional correctness (via EDA simulation) is not the primary metric.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "All data is collected from public GitHub repositories using keyword search; no temporal splits, canary strings, or anti-contamination measures are described, despite the fact that models like DeepSeek-Coder likely trained on the same GitHub data.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No discussion of dataset versioning, update plans, or whether benchmarks will remain discriminative as LLMs continue to improve; no temporal train/test splits are described.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper notes that PPA prediction is difficult, but it provides no systematic analysis of what the benchmark fails to measure (e.g., functional correctness, timing closure) or how it could be gamed.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The dataset is available at the gitbook URL, and baseline fine-tuning results for 6 LLM variants are reported with specific metric values on RTLLM and VerilogEval benchmarks.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Tables I–III document repository counts, file counts, functional categories, and task splits; the collection methodology using 222 keywords from Alldatasheet is described, and synthesis settings (technology libraries, EDA tools with version numbers) are specified.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The data is pointed to at https://zeju.gitbook.io/lcm-team but no licensing terms, usage restrictions, or access conditions are stated in the paper.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Intended uses (RTL understanding, generation, completion, PPA prediction) are described, but no guidance is given on what should NOT be concluded from benchmark scores or what use cases are out of scope.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "Fine-tuning on DeepCircuitX leads to significant performance improvements across all LLMs and metrics",
    203       "evidence": "Table VI shows BLEU-4 improvements (e.g., CodeGen2.5: 0.1060→13.69); Table VII shows Pass@1 improvements (e.g., CodeGen2.5: 17.24%→24.14% on RTLLM)",
    204       "supported": "moderate"
    205     },
    206     {
    207       "claim": "The dataset is adaptable across model sizes from 220M to 16B parameters",
    208       "evidence": "CodeT5+ 220M improves from BLEU-4 0.14 to 4.91; DeepSeek-Coder 16B improves from 2.24 to 11.92 — all sizes show gains but no comparison to random baselines",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "DeepCircuitX enables early-stage PPA prediction from RTL code",
    213       "evidence": "Table VIII shows MAPE for area prediction as low as 0.33 with 100% data, but delay prediction remains poor (3.48–4.74 MAPE) and test set is only 10 designs",
    214       "supported": "weak"
    215     },
    216     {
    217       "claim": "Annotation quality is high, with all metrics scoring above 3.5/4 in human evaluation",
    218       "evidence": "Table V shows Accuracy 3.74/4, Completeness 3.79/4, Clarity 3.84/4 at repo level; 3.5/4 at module level — 5 reviewers per sample but reviewer selection process not described",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "PPA prediction on practical designs (>10k cells) remains an open challenge",
    223       "evidence": "Table VIII shows that models previously evaluated on simple benchmarks perform worse on designs with >10k cells; delay prediction is particularly poor for all models",
    224       "supported": "strong"
    225     }
    226   ],
    227   "methodology_tags": [
    228     "benchmark-eval",
    229     "case-study"
    230   ],
    231   "key_findings": "DeepCircuitX provides 4,000+ repository-level RTL projects with multi-level CoT annotations and synthesized netlists/PPA metrics, addressing gaps in prior file-level-only EDA datasets. Fine-tuning any of the tested LLMs (CodeLlama, CodeT5+, CodeGen, DeepSeek variants, 220M–16B) on this dataset consistently improves RTL understanding and generation metrics over non-fine-tuned baselines. PPA prediction accuracy improves with training data volume but delay prediction remains unsatisfactory, with MAPE above 3.4 even at 100% training data on 10-design test sets. Human evaluators rated CoT annotations above 3.5/4 on accuracy, completeness, and clarity.",
    232   "red_flags": [
    233     {
    234       "flag": "No contamination analysis",
    235       "detail": "All data is sourced from public GitHub repositories; there are no temporal splits or anti-contamination measures, yet the evaluated models (DeepSeek-Coder, CodeLlama) were likely pretrained on the same GitHub corpus."
    236     },
    237     {
    238       "flag": "BLEU/ROUGE for hardware 'understanding'",
    239       "detail": "Surface n-gram overlap metrics are used to evaluate RTL code understanding without arguing why linguistic similarity to reference text indicates correct hardware comprehension."
    240     },
    241     {
    242       "flag": "PPA test set of 10 designs",
    243       "detail": "The PPA prediction experiments train on 146 designs and test on only 10, making the evaluation results statistically unreliable and likely high-variance."
    244     },
    245     {
    246       "flag": "No human baseline on benchmark tasks",
    247       "detail": "Human evaluation covers only annotation quality ratings (1–4 scale), not benchmark task performance; there is no reference point for how human hardware engineers would perform on the same tasks."
    248     },
    249     {
    250       "flag": "No funding disclosure",
    251       "detail": "Although this is a multi-institution paper whose primary authors all share a co-affiliation with the National Center of Technology Innovation for EDA, no funding sources are disclosed anywhere."
    252     },
    253     {
    254       "flag": "No licensing terms",
    255       "detail": "The dataset is hosted externally (gitbook URL) with no licensing terms stated in the paper; it is unclear whether the GitHub-scraped Verilog files are legally redistributable."
    256     }
    257   ],
    258   "cited_papers": [
    259     {
    260       "title": "RTL-Repo: A Benchmark for Evaluating LLMs on Large-Scale RTL Design Projects",
    261       "relevance": "Direct predecessor dataset collecting 1,000+ RTL repositories; DeepCircuitX extends this with multi-level annotation and PPA data"
    262     },
    263     {
    264       "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation",
    265       "relevance": "Establishes VerilogEval benchmark used as evaluation target in this paper's experiments"
    266     },
    267     {
    268       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    269       "relevance": "The other primary benchmark used for Pass@k evaluation of RTL code generation"
    270     },
    271     {
    272       "title": "CircuitNet 2.0: An Advanced Dataset for Promoting Machine Learning Innovations in Realistic Chip Design Environment",
    273       "relevance": "Key prior dataset work for physical design; DeepCircuitX contrasts by providing RTL-level data rather than post-layout data"
    274     },
    275     {
    276       "title": "Data is All You Need: Finetuning LLMs for Chip Design via an Automated Design-Data Augmentation Framework",
    277       "relevance": "Related data augmentation approach for chip design LLMs; compared and contrasted in related work"
    278     },
    279     {
    280       "title": "MasterRTL: A Pre-Synthesis PPA Estimation Framework for Any RTL Design",
    281       "relevance": "One of the three PPA prediction baselines evaluated on DeepCircuitX data"
    282     },
    283     {
    284       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    285       "relevance": "Methodological foundation for the CoT annotation approach used throughout the dataset construction"
    286     },
    287     {
    288       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    289       "relevance": "Related work on fine-tuning LLMs for Verilog generation; directly compared in experimental context"
    290     }
    291   ],
    292   "engagement_factors": {
    293     "practical_relevance": {
    294       "score": 1,
    295       "justification": "Useful for the small niche of hardware designers using LLMs for RTL code, but irrelevant to most software practitioners."
    296     },
    297     "surprise_contrarian": {
    298       "score": 0,
    299       "justification": "Results confirm expected pattern that fine-tuning on domain-specific data improves performance, with no surprising findings."
    300     },
    301     "fear_safety": {
    302       "score": 0,
    303       "justification": "No safety, security, or risk implications discussed."
    304     },
    305     "drama_conflict": {
    306       "score": 0,
    307       "justification": "No controversy or challenge to existing claims; straightforward dataset contribution paper."
    308     },
    309     "demo_ability": {
    310       "score": 1,
    311       "justification": "Dataset is available via a Gitbook page but requires significant setup for fine-tuning and synthesis tool access."
    312     },
    313     "brand_recognition": {
    314       "score": 0,
    315       "justification": "From Chinese University of Hong Kong and partner institutions, not widely recognized labs in the broader tech community."
    316     }
    317   },
    318   "hn_data": {
    319     "threads": [],
    320     "top_points": 0,
    321     "total_points": 0,
    322     "total_comments": 0
    323   }
    324 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs