scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19926B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DeepCircuitX: A Comprehensive Repository-Level Dataset for RTL Code Understanding, Generation, and PPA Analysis",
      6     "authors": [
      7       "Zeju Li",
      8       "Changran Xu",
      9       "Zhengyuan Shi",
     10       "Zedong Peng",
     11       "Yi Liu",
     12       "Yunhao Zhou",
     13       "Lingfeng Zhou",
     14       "Chengyu Ma",
     15       "Jianyuan Zhong",
     16       "Xi Wang",
     17       "Jieru Zhao",
     18       "Zhufei Chu",
     19       "Xiaoyan Yang",
     20       "Qiang Xu"
     21     ],
     22     "year": 2025,
     23     "venue": "2025 IEEE International Conference on LLM-Aided Design (ICLAD)",
     24     "arxiv_id": "2502.18297",
     25     "doi": "10.1109/ICLAD65226.2025.00029"
     26   },
     27   "checklist": {
     28     "claims_and_evidence": {
     29       "abstract_claims_supported": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The abstract's core claims — 4,000+ repo-level RTL projects (Table I), multi-level CoT annotations, fine-tuning effectiveness (Tables VI–VII), and human quality evaluation (Table V, all >3.5/4) — are all supported by the paper's content.",
     33         "source": "haiku"
     34       },
     35       "causal_claims_justified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper makes the causal claim that 'fine-tuning LLMs on our dataset leads to significant performance improvements' but only compares fine-tuned vs. non-fine-tuned versions of the same models; no ablation isolates whether gains come from the CoT annotations, repository-level structure, data volume, or domain specificity.",
     39         "source": "haiku"
     40       },
     41       "generalization_bounded": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The conclusion broadly claims DeepCircuitX 'establishes new benchmarks for RTL tasks' and will 'transform this critical domain,' but the PPA prediction experiment uses only 10 test designs and coverage of EDA tasks beyond understanding/generation/completion is not demonstrated.",
     45         "source": "haiku"
     46       },
     47       "alternative_explanations_discussed": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No alternative explanations are offered for observed performance gains; the paper does not consider whether gains stem simply from increased domain-specific training volume rather than the distinctive repository-level or CoT properties of the dataset.",
     51         "source": "haiku"
     52       },
     53       "proxy_outcome_distinction": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "BLEU/METEOR/ROUGE are used to evaluate 'RTL code understanding' without acknowledging that these measure surface linguistic similarity, not semantic or functional understanding; the paper does not distinguish between the proxy metric and the claimed capability.",
     57         "source": "haiku"
     58       }
     59     },
     60     "limitations_and_scope": {
     61       "limitations_section_present": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "There is no dedicated limitations or threats-to-validity section; the only acknowledgment of shortcomings is one sentence in the PPA discussion noting that delay prediction is an open problem.",
     65         "source": "haiku"
     66       },
     67       "threats_to_validity_specific": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No threats to validity are discussed; notable concerns such as data contamination (GitHub repos in LLM pretraining), the tiny PPA test set (n=10), or evaluator independence in human evaluation are not mentioned.",
     71         "source": "haiku"
     72       },
     73       "scope_boundaries_stated": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No explicit scope boundaries are stated; the paper does not specify what the benchmark results cannot show or where the dataset would fail to generalize (e.g., proprietary EDA flows, non-Verilog HDLs, advanced process nodes beyond those tested).",
     77         "source": "haiku"
     78       }
     79     },
     80     "conflicts_of_interest": {
     81       "funding_disclosed": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No funding disclosure appears anywhere in the paper text; the National Center of Technology Innovation for EDA affiliation is listed but no grants or sponsors are acknowledged.",
     85         "source": "haiku"
     86       },
     87       "affiliations_disclosed": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "All author affiliations are clearly listed in the header (CUHK, SJTU, Hangzhou Dianzi University, Ningbo University, Southeast University, and the National Center of Technology Innovation for EDA).",
     91         "source": "haiku"
     92       },
     93       "funder_independent_of_outcome": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No funding is disclosed, so this criterion is not applicable.",
     97         "source": "haiku"
     98       },
     99       "financial_interests_declared": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No competing interests statement, no declaration of patents, equity, or consulting relationships appears in the paper.",
    103         "source": "haiku"
    104       }
    105     },
    106     "scope_and_framing": {
    107       "key_terms_defined": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "RTL (Register Transfer Level), PPA (Power, Performance, Area), CoT (Chain of Thought), and the hierarchy of design levels (chip, IP, module, block) are all defined explicitly in the introduction and dataset sections.",
    111         "source": "haiku"
    112       },
    113       "intended_contribution_clear": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper lists four explicit contributions: a repository-level dataset of 4,000+ RTL projects, four-level organization, CoT annotation methodology, and pre-training/evaluation benchmarks for RTL and PPA tasks.",
    117         "source": "haiku"
    118       },
    119       "engagement_with_prior_work": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Section II provides a structured comparison with prior EDA datasets (CircuitNet, ISCAS, RTL-Repo, RTLLM, VerilogEval) and articulates specific gaps that DeepCircuitX addresses (file-level only, no PPA data, no CoT annotations).",
    123         "source": "haiku"
    124       }
    125     }
    126   },
    127   "type_checklist": {
    128     "benchmark-creation": {
    129       "construct_design": {
    130         "construct_validity_argued": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The paper argues that repository-level structure is needed for comprehensive RTL modeling but does not formally argue why BLEU/METEOR/ROUGE measure RTL understanding or why Pass@k adequately captures generation quality beyond syntactic correctness.",
    134           "source": "haiku"
    135         },
    136         "difficulty_distribution_characterized": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No difficulty tiers are defined or measured; the benchmark tasks are described by category (IP/Module/Chip) and count but not by difficulty level, and no analysis of item difficulty distribution is provided.",
    140           "source": "haiku"
    141         },
    142         "ceiling_floor_effects_checked": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "Several base models score 0% Pass@k (floor effect) and this is not discussed as a benchmark design concern; the paper presents it as evidence of effectiveness rather than a signal that the benchmark may be miscalibrated for those models.",
    146           "source": "haiku"
    147         },
    148         "human_baseline_included": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Human evaluation is conducted only to rate annotation quality (accuracy, completeness, clarity on a 1–4 scale), not to establish how humans perform on the RTL understanding, completion, or generation benchmark tasks.",
    152           "source": "haiku"
    153         },
    154         "scoring_rubric_justified": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "BLEU/METEOR/ROUGE and MAPE/RRSE are described but not justified as the right metrics for RTL-specific tasks; no discussion of edge cases in scoring (e.g., functionally correct but syntactically different code) is provided.",
    158           "source": "haiku"
    159         }
    160       },
    161       "robustness": {
    162         "contamination_resistance_designed": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No contamination-resistance measures are implemented; the dataset is collected from GitHub, which is in the pretraining corpora of all evaluated LLMs (CodeLlama, CodeT5+, CodeGen2, CodeGen2.5, DeepSeek), and no temporal split or canary mechanism is used.",
    166           "source": "haiku"
    167         },
    168         "temporal_robustness_discussed": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No discussion of how the benchmark will remain useful as LLMs improve or become more capable at RTL code; no versioning or update plan is mentioned.",
    172           "source": "haiku"
    173         },
    174         "failure_modes_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "PPA delay prediction failure is noted in passing, but systematic failure modes of the benchmark itself (what it cannot measure, how it could be gamed, what it conflates) are not discussed.",
    178           "source": "haiku"
    179         },
    180         "baseline_implementations_provided": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "Fine-tuned model weights and training code are not mentioned as available; the dataset URL (a gitbook page) is given, but there is no mention of releasing the evaluation harness or fine-tuned checkpoints needed to reproduce reported numbers.",
    184           "source": "haiku"
    185         }
    186       },
    187       "documentation": {
    188         "dataset_documentation_complete": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "The paper describes data collection methodology and structure but provides no formal data card, no explicit train/validation/test splits for the benchmark tasks, and no description of quality filtering beyond functional correctness implied by synthesis.",
    192           "source": "haiku"
    193         },
    194         "licensing_and_access_clear": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "The dataset is listed as available at a gitbook URL but no license is specified; terms of use, redistribution rights, and whether the GitHub-sourced RTL code carries inherited licenses are not addressed.",
    198           "source": "haiku"
    199         },
    200         "intended_use_specified": {
    201           "applies": true,
    202           "answer": false,
    203           "justification": "Intended uses (LLM fine-tuning for RTL tasks, PPA prediction) are described, but the paper does not specify what should NOT be concluded from benchmark results or known limitations of the intended use cases.",
    204           "source": "haiku"
    205         }
    206       }
    207     }
    208   },
    209   "claims": [
    210     {
    211       "claim": "Fine-tuning LLMs on DeepCircuitX leads to significant performance improvements across all RTL understanding, completion, and generation metrics compared to non-fine-tuned counterparts.",
    212       "evidence": "Tables VI and VII show consistent gains across CodeLlama, CodeT5+, CodeGen2, CodeGen2.5, and DeepSeek models; e.g., CodeGen2.5 BLEU-4 rises from 0.11 to 13.69, and Pass@1 on RTLLM rises from 17.24% to 24.14% (though the non-fine-tuned baseline was already non-trivial).",
    213       "supported": "moderate"
    214     },
    215     {
    216       "claim": "DeepCircuitX is the first comprehensive repository-level RTL dataset combining multilevel code with netlists and PPA metrics.",
    217       "evidence": "Related work in Section II systematically compares with prior datasets (RTL-Repo, RTLLM, VerilogEval, CircuitNet) and identifies their limitations (file-level only, no PPA data), supporting novelty of the combination.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "CoT annotations generated by GPT-4 and Claude are high quality, as confirmed by independent expert human evaluation.",
    222       "evidence": "Table V shows all six metrics (repo and module annotation accuracy, completeness, clarity) score above 3.5/4 from 5 reviewers per sample; evaluator selection and sample size are not reported.",
    223       "supported": "moderate"
    224     },
    225     {
    226       "claim": "PPA prediction for practical designs (>10k cells) remains an open challenge, particularly for delay prediction.",
    227       "evidence": "Table VIII shows delay MAPE of 4.74 (SNS) and 3.48 (MasterRTL) even at 100% training data, with RRSE values >2; the paper attributes this to logic synthesis optimization complexity.",
    228       "supported": "strong"
    229     },
    230     {
    231       "claim": "Models of different scales (220M to 16B) all benefit from fine-tuning on DeepCircuitX, demonstrating dataset adaptability.",
    232       "evidence": "Table VI shows CodeT5+ 220M improves from BLEU-4 0.14 to 4.91, while 7B and 16B models show comparable or larger gains, supporting scale-agnostic benefit.",
    233       "supported": "strong"
    234     },
    235     {
    236       "claim": "DeepCircuitX covers 77 functional categories across chip, IP, and module designs with over 4,000 repositories and 140,000 RTL files.",
    237       "evidence": "Table I confirms: 17 chip-level categories (1,002 repos, 54,650 files), 3 IP-level (1,410 repos, 92,467 files), 57 module-level (2,383 repos, 38,692 files).",
    238       "supported": "strong"
    239     }
    240   ],
    241   "methodology_tags": [
    242     "benchmark-eval",
    243     "observational"
    244   ],
    245   "key_findings": "DeepCircuitX introduces a repository-level RTL dataset of 4,000+ projects spanning chip, IP, and module designs, enriched with multi-level CoT annotations (GPT-4/Claude) and synthesized PPA metrics across five technology nodes. LLMs fine-tuned on this dataset consistently outperform their non-fine-tuned baselines on RTL understanding (BLEU-4, METEOR, ROUGE) and generation/completion (Pass@k on RTLLM and VerilogEval), across model scales from 220M to 16B parameters. Human evaluators rated annotation quality above 3.5/4 on accuracy, completeness, and clarity. PPA prediction remains an open problem, especially for delay estimation on large designs (>10k cells), where all tested models show high error rates even with full training data.",
    246   "red_flags": [
    247     {
    248       "flag": "GitHub contamination unaddressed",
    249       "detail": "All evaluated LLMs were pretrained on GitHub code; the dataset is collected from GitHub using keyword search with no temporal split, canary strings, or deduplication against model training sets — making benchmark results potentially optimistic due to data contamination."
    250     },
    251     {
    252       "flag": "PPA test set n=10",
    253       "detail": "The PPA prediction evaluation uses only 10 test designs, making statistical conclusions about model performance unreliable; confidence intervals and variance are not reported."
    254     },
    255     {
    256       "flag": "No human baseline for benchmark tasks",
    257       "detail": "Human evaluation measures annotation quality only; no human performance baseline is provided for RTL understanding, completion, or generation tasks, making it impossible to assess how far models are from human-level performance."
    258     },
    259     {
    260       "flag": "Proxy metrics for understanding",
    261       "detail": "BLEU/METEOR/ROUGE measure surface n-gram overlap with reference text, not functional or semantic understanding of RTL code; the paper uses these without justification or discussion of their validity for this domain."
    262     },
    263     {
    264       "flag": "No statistical significance testing",
    265       "detail": "Performance improvements are described as 'significant' throughout but no statistical tests, confidence intervals, or variance across runs are reported."
    266     },
    267     {
    268       "flag": "No license for dataset",
    269       "detail": "The dataset is collected from GitHub repositories (which carry individual licenses) and annotated with GPT-4/Claude outputs (which may carry usage restrictions); no license or terms of use are specified for the released dataset."
    270     },
    271     {
    272       "flag": "No limitations section",
    273       "detail": "The paper has no dedicated limitations or threats-to-validity section; the conclusion only mentions open problems in PPA prediction without discussing limitations of the dataset or benchmark methodology."
    274     }
    275   ],
    276   "cited_papers": [
    277     {
    278       "title": "RTL-Repo: A Benchmark for Evaluating LLMs on Large-Scale RTL Design Projects",
    279       "relevance": "Direct predecessor establishing the concept of repository-level RTL benchmarking; DeepCircuitX explicitly builds on and extends this work."
    280     },
    281     {
    282       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    283       "relevance": "Used as one of two primary evaluation benchmarks for RTL code completion and generation in the paper's experiments (Table VII)."
    284     },
    285     {
    286       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    287       "relevance": "Second primary evaluation benchmark used for Pass@k scoring in RTL generation experiments."
    288     },
    289     {
    290       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    291       "relevance": "Foundational method adopted for the CoT annotation methodology central to DeepCircuitX's annotation pipeline."
    292     },
    293     {
    294       "title": "CircuitNet: An Open-Source Dataset for Machine Learning Applications in Electronic Design Automation",
    295       "relevance": "Key prior EDA dataset that DeepCircuitX differentiates from by providing RTL-level rather than post-synthesis layout data."
    296     },
    297     {
    298       "title": "MasterRTL: A Pre-Synthesis PPA Estimation Framework for Any RTL Design",
    299       "relevance": "One of the PPA prediction models evaluated on DeepCircuitX in Table VIII; provides direct comparison baseline."
    300     },
    301     {
    302       "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation",
    303       "relevance": "Prior work collecting 50,000 open-source Verilog samples for LLM fine-tuning; directly compared as a file-level-only dataset that DeepCircuitX supersedes."
    304     },
    305     {
    306       "title": "MG-Verilog: Multi-Grained Dataset Towards Enhanced LLM-Assisted Verilog Generation",
    307       "relevance": "Contemporary dataset for LLM-assisted Verilog generation; cited as an example of existing high-quality hardware data efforts that DeepCircuitX extends."
    308     }
    309   ],
    310   "engagement_factors": {
    311     "practical_relevance": {
    312       "score": 2,
    313       "justification": "Directly usable by researchers fine-tuning LLMs for hardware design automation, but the domain (RTL/EDA) is narrow and requires specialized infrastructure (commercial EDA tools like Synopsys Design Compiler) to reproduce."
    314     },
    315     "surprise_contrarian": {
    316       "score": 1,
    317       "justification": "The finding that domain-specific fine-tuning improves performance is expected; the PPA delay prediction difficulty is a useful negative result but not surprising to EDA practitioners."
    318     },
    319     "fear_safety": {
    320       "score": 0,
    321       "justification": "Hardware design dataset with no AI safety, security, or risk implications."
    322     },
    323     "drama_conflict": {
    324       "score": 0,
    325       "justification": "No controversy, competing claims, or replication disputes involved."
    326     },
    327     "demo_ability": {
    328       "score": 2,
    329       "justification": "Dataset is publicly available at the gitbook URL; researchers can download and fine-tune their own models, though reproducing PPA synthesis results requires commercial EDA tools."
    330     },
    331     "brand_recognition": {
    332       "score": 1,
    333       "justification": "CUHK (The Chinese University of Hong Kong) has moderate recognition in the EDA/ML community; no major industry lab affiliation."
    334     }
    335   },
    336   "hn_data": {
    337     "threads": [],
    338     "top_points": 0,
    339     "total_points": 0,
    340     "total_comments": 0
    341   }
    342 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs