scan.json (27289B)
1 { 2 "paper": { 3 "title": "ChipBench: A Next-Step Benchmark for Evaluating LLM Performance in AI-Aided Chip Design", 4 "authors": [ 5 "Zhongkai Yu", 6 "Chenyang Zhou", 7 "Yichen Lin", 8 "Hejia Zhang", 9 "Haotian Ye", 10 "Junxia Cui", 11 "Zaifeng Pan", 12 "Jishen Zhao", 13 "Yufei Ding" 14 ], 15 "year": 2026, 16 "venue": "Preprint (arXiv)", 17 "arxiv_id": "2601.21448" 18 }, 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract states 'Our code is available at https://github.com/zhongkaiyu/ChipBench.git' — a direct GitHub URL is provided." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The benchmark data (44 Verilog generation cases, 89 debugging cases, 132 reference model cases) is released through the same GitHub repository. The paper describes gold Verilog implementations, test files, and prompts as part of the released benchmark." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is described in the paper. The paper mentions iVerilog and Verilator as tools but does not specify versions or dependency details for reproducing the evaluation environment." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "While the framework workflow is described conceptually (Section 2.4, Figure 3), the paper does not provide step-by-step reproduction instructions, README commands, or scripts to replicate the main experiments. The workflow description is at a design level, not an actionable instruction level." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Tables 2, 3, and 4 report only point estimates for pass@1, pass@5, and pass@10. 
No confidence intervals, error bars, or uncertainty measures are provided." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper makes comparative claims (e.g., 'debugging capability exceeds generation capability', 'Python generation outperforms Verilog on simple modules') but provides no statistical significance tests — the comparisons rest solely on raw percentages." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports effect sizes with baseline context. For example, 'GPT-4o achieves ~65% pass@1 on RTLLM V2 and VerilogEval V2, but only 10.37% on our dataset' and 'MAGE achieves only 37.41% on ChipBench' vs. '95% accuracy on VerilogEval.' These provide enough context to understand the magnitude of differences." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification is provided for why 44 Verilog generation cases, 89 debugging cases, or 132 reference model cases were chosen. There is also no power analysis or discussion of whether these sample sizes are sufficient for the claims being made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper reports pass@1, pass@5, and pass@10 but does not report standard deviation, variance, or spread across multiple runs. The evaluation appears to use sampling (temperature=0.85, top_p=0.95) but does not report run-to-run variance." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper compares 13 models (GPT-3.5 through GPT-5.2, Claude family, Gemini family, DeepSeek family, Llama family) and the MAGE multi-agent system. It also compares ChipBench characteristics against existing benchmarks VerilogEval and RTLLM in Table 1." 
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The baselines include very recent models: GPT-5.2 (Dec 2025), Gemini 3 Flash (Dec 2025), Claude 4.5 Opus (Nov 2025), and MAGE. These represent state-of-the-art at time of writing." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": false, 83 "justification": "The benchmark has multiple design decisions (three task categories, four bug types, three reference model languages, zero-shot vs one-shot debugging) but no ablation study examines how these design choices affect the benchmark's ability to discriminate between models. The zero-shot vs one-shot comparison (Section 3.3) is a task variant analysis, not a benchmark design ablation." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper reports pass@1, pass@5, and pass@10, and also provides per-category breakdowns (CPU IP, Non-Self-Contained, Self-Contained) and per-bug-type breakdowns (arithmetic, assignment, state machine, timing). Cost analysis is also provided in Table 5." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a benchmark paper evaluating LLM code generation via automated test suites (iVerilog simulation, Verilator comparison). Human evaluation of LLM outputs is not relevant — the pass/fail evaluation is deterministic via simulation." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "This paper introduces a new benchmark rather than training a model. There is no dev/test split needed — all 264+ test cases are the benchmark itself." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Tables 2, 3, and 4 all provide per-category breakdowns: Verilog generation is broken down by CPU IP, Non-Self-Contained, and Self-Contained; debugging is broken down by bug type (arithmetic, assignment, state machine, timing)." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 3.3 discusses failure patterns in one-shot debugging ('most current LLMs lack the capability to interpret [waveform data] effectively'). Section 3.2 notes CPU IP design as the most challenging task with no model exceeding 22.22%. The paper also notes that 'all models failed when tasked with generating complete hierarchical designs' (Section 2.2)." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper reports that one-shot debugging 'underperforms on 13' out of 21 model configurations compared to zero-shot (Section 3.3), which is a surprising negative finding. It also reports 0% pass rates for several models on CPU IP tasks." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims Claude-4.5-opus achieves 30.74% on Verilog generation (confirmed in Table 2) and 13.33% on Python reference model generation (confirmed in Table 3 for Claude Opus; the 15.93% figure in the paper body is for Claude Sonnet, a different model, so the abstract's number is stated correctly). The abstract's claim of '95% pass rates' on existing benchmarks is supported by the MAGE reference in the introduction." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper makes causal claims like 'Python generation outperforms Verilog on simple modules... likely due to Python's prevalence in LLM pre-training data' (Section 3.2) without evidence for this causal mechanism. It also claims 'most current LLMs lack the capability to interpret [waveform data] effectively' (Section 3.3) based on aggregate pass rate comparisons without controlling for confounds." 
126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper title and framing ('AI-Aided Chip Design') suggest broad applicability, but the benchmark covers only 44 modules from specific open-source CPU IPs and competitive platforms. The paper does not bound its claims to these specific sources. Claims about 'industrial deployment requirements' are made without testing on actual industrial designs." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not discuss alternative explanations for its main findings. For example, the lower performance on ChipBench vs VerilogEval could be partly due to longer prompts exceeding context windows, test harness differences, or other factors beyond module complexity. The one-shot debugging underperformance could be due to prompt format rather than LLM capability. No threats-to-validity or alternative explanations section is present." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Table 6 in the Appendix provides exact API model names with snapshot dates for all evaluated models (e.g., 'gpt-5.2-2025-12-11', 'claude-opus-4-5-20251101', 'gpt-4o-2024-11-20'). This is excellent version specification." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes the prompting approach conceptually (Section 2.2: 'we manually wrote description prompts'; Section 2.3: debugging modes provide 'module description and buggy implementation') but the actual prompt text used for evaluation is not shown in the paper. The prompts are presumably in the GitHub repository but not presented in the paper or appendix." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.1 states 'For all models, we use the hyperparameters recommended by VerilogEval: temperature=0.85 and top_p=0.95.' These are the key sampling parameters." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": false, 157 "justification": "The MAGE multi-agent system is tested but described only as 'a state-of-the-art multi-agent framework for Verilog generation' that 'employs dedicated agents for sampling, debugging, and decision-making.' The actual scaffolding details (tool descriptions, retry logic, decision criteria) are not described — the reader is referred to the original MAGE paper." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 2.2 describes how test cases were sourced ('open-source Online Judge platforms and established CPU IP projects'), categorized into three groups, and how prompts were manually written. Section 2.3 describes how debugging cases were constructed by manually injecting faults. The data pipeline from source to benchmark is well-documented." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The paper has an 'Impact Statement' but it only says 'There are many potential societal consequences of our work, none which we feel must be specifically highlighted here.' The 'Take-Aways' section (Section 4) discusses future directions, not limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address potential issues with benchmark size (44 Verilog generation cases), selection bias in module choice, or limitations of the evaluation methodology." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the specific module types tested or acknowledge that 44 modules may not represent the full diversity of industrial chip designs." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The benchmark data (golden Verilog, test files, prompts, buggy implementations) is released at the GitHub repository. The raw evaluation data (model outputs) is not explicitly mentioned but the benchmark inputs are available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 2.2 describes sourcing from 'open-source Online Judge (OJ) platforms and established CPU IP projects.' Section 2.3 describes how debugging cases were created by manually injecting faults into golden modules. Table 7 lists all 44 modules with descriptions." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The data comes from open-source code repositories and benchmarking of LLM APIs." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from golden Verilog modules through bug injection (Section 2.3), test harness creation (Section 2.4), and reference model generation (Section 2.5) is documented with workflow diagrams (Figure 3). The benchmark construction pipeline is clear." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: UC San Diego Department of Computer Science and Engineering (authors 1, 3-8) and Columbia University (author 2)." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding source is disclosed, so independence cannot be assessed. The paper evaluates products from multiple commercial companies (Anthropic, OpenAI, Google, Meta) without disclosing any potential funding relationships." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial disclosure is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not state training data cutoff dates for any of the evaluated models. Table 6 lists release dates but not training data cutoffs. Since the benchmark uses modules from open-source projects, contamination is a relevant concern." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The benchmark modules are sourced from 'open-source Online Judge (OJ) platforms and established CPU IP projects' which may be in LLM training data. The paper does not discuss whether any of these modules could have been seen during model training." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The benchmark uses open-source CPU IP projects and online judge problems that may have been available online before model training cutoffs. This contamination risk is not addressed. While the authors note the benchmark is new, the underlying source code it draws from is pre-existing open-source material." 
241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this benchmark evaluation study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are involved." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 5 provides detailed cost analysis including input tokens, output tokens, cost in dollars, and cost per pass@1 for each model. Section 3.4 also reports costs for the training dataset generation tool (Figure 6)." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 5 reports token consumption and API costs for each model. For the toolbox evaluation, Figure 6 shows total cost scaling with max debugging turns. While GPU hours are not reported (API-based evaluation), the API cost budget is clearly stated." 
290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "State-of-the-art Claude-4.5-opus achieves only 30.74% pass@1 on Verilog generation and 13.33% on Python reference model generation on ChipBench.", 296 "evidence": "Table 2 shows Claude 4.5 Opus at 30.74% average pass@1 for Verilog generation. Table 3 shows Claude 4.5 Opus at 13.33% average pass@1 for Python reference model generation.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "MAGE, which exceeds 95% accuracy on VerilogEval, achieves only 37.41% on ChipBench.", 301 "evidence": "Table 2 shows MAGE (DeepSeek-V3) at 37.41% pass@1 average. The 95% on VerilogEval claim is attributed to Zhao et al., 2025b in the introduction.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "LLMs consistently perform better on Verilog debugging than generation when given the same module.", 306 "evidence": "Section 3.2 compares Claude-opus at 47.45% debugging (Table 4) vs 30.74% generation (Table 2). This pattern holds across models, with 5%-20% higher pass rates stated for debugging.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "One-shot debugging (with waveform data) does not consistently outperform zero-shot debugging, suggesting LLMs lack waveform interpretation capability.", 311 "evidence": "Section 3.3 and Figure 5 report that one-shot outperforms zero-shot on 8 models but underperforms on 13.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "For simple self-contained modules, LLMs achieve 5%-20% higher accuracy on Python reference model generation compared to Verilog generation.", 316 "evidence": "Comparing Table 3 self-contained columns with Table 2 self-contained columns shows higher Python pass rates for most models on simple modules (e.g., Claude Opus: 40% Python vs 36.67% Verilog).", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "The automated toolbox can generate approximately n/2 Python reference models from n Verilog training samples with multi-turn debugging.", 321 "evidence": 
"Section 3.4 reports 2,206 verified models from 10,000 samples (22%), and Figure 6 shows pass rate plateauing around 50% with higher max turns on a 100-sample subset.", 322 "supported": "weak" 323 } 324 ], 325 "methodology_tags": [ 326 "benchmark-eval" 327 ], 328 "key_findings": "ChipBench introduces a benchmark with 264+ test cases spanning Verilog generation, debugging, and reference model generation that reveals significant performance gaps in current LLMs for chip design tasks. The best-performing model (Claude 4.5 Opus) achieves only 30.74% pass@1 on Verilog generation and 13.33% on Python reference model generation, demonstrating that existing saturated benchmarks with 95%+ pass rates overestimate LLM capabilities for industrial deployment. LLMs show stronger debugging ability than generation ability, and one-shot debugging with waveform data does not consistently improve performance, suggesting LLMs struggle with waveform interpretation.", 329 "red_flags": [ 330 { 331 "flag": "No contamination analysis", 332 "detail": "The benchmark draws modules from open-source CPU IPs and online judge platforms. These are public repositories that are very likely in the training data of the evaluated models. No contamination analysis, temporal split, or canary string approach is used. This is especially concerning for the self-contained modules from OJ platforms, which resemble common coding contest problems." 333 }, 334 { 335 "flag": "No limitations section", 336 "detail": "The paper has no limitations or threats-to-validity section. For a benchmark paper, key missing discussions include: selection bias in module choice, small benchmark size (44 Verilog generation cases), representativeness of OJ and open-source CPU IP problems for industrial workflows, and the gap between the benchmark's claim to represent 'industrial complexity' and its actual source material." 
337 }, 338 { 339 "flag": "Small and potentially unrepresentative benchmark", 340 "detail": "With only 44 Verilog generation cases, 6 non-self-contained cases, and 9 CPU IP cases, per-category sample sizes are very small. The CPU IP category has only 9 cases, yet claims about model capabilities on industrial designs are drawn from these. Individual category pass rates could shift dramatically with a few additional cases." 341 }, 342 { 343 "flag": "No statistical uncertainty quantification", 344 "detail": "Despite using stochastic sampling (temperature=0.85, top_p=0.95), the paper reports no confidence intervals, standard deviations, or significance tests. The pass@5 and pass@10 metrics require multiple samples, but variance across sampling runs is not reported. Differences between models could be within noise." 345 }, 346 { 347 "flag": "Unbounded generalization claims", 348 "detail": "The paper claims the benchmark reflects 'industrial deployment requirements' and 'real-world industrial scenarios,' but the modules come from open-source CPU IPs and competitive platforms. Industrial Verilog modules typically exceed 10,000 lines (as the paper itself notes), while ChipBench averages 61.7 lines. The gap between the benchmark and true industrial complexity is acknowledged but not reflected in the paper's claims." 349 } 350 ], 351 "cited_papers": [ 352 { 353 "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation", 354 "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"], 355 "year": 2023, 356 "relevance": "Seminal benchmark for LLM Verilog generation evaluation; ChipBench directly compares against and extends this benchmark." 
357 }, 358 { 359 "title": "MAGE: A Multi-Agent Engine for Automated RTL Code Generation", 360 "authors": ["Yiliang Zhao", "Hejia Zhang", "Hanyi Huang", "Zhongkai Yu", "Jishen Zhao"], 361 "year": 2025, 362 "relevance": "State-of-the-art multi-agent framework for Verilog generation evaluated as a baseline, achieving 95% on VerilogEval but only 37% on ChipBench." 363 }, 364 { 365 "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model", 366 "authors": ["Yao Lu", "Shang Liu", "Qijun Zhang", "Zhiyao Xie"], 367 "year": 2024, 368 "relevance": "Competing benchmark for LLM-based RTL generation that ChipBench claims to supersede due to saturation." 369 }, 370 { 371 "title": "Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation", 372 "authors": ["Nathaniel Pinckney", "Christopher Batten", "Mingjie Liu", "Haoxing Ren", "Brucek Khailany"], 373 "year": 2025, 374 "relevance": "Updated version of VerilogEval benchmark; ChipBench inherits its evaluation framework and compares against its design choices." 375 }, 376 { 377 "title": "RealBench: Benchmarking Verilog Generation Models with Real-World IP Designs", 378 "authors": ["Peng Jin", "Dong Huang", "Cheng Li"], 379 "year": 2025, 380 "relevance": "Contemporary benchmark using real-world IP designs; described as complementary to ChipBench in evaluating LLM chip design capabilities." 381 }, 382 { 383 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 384 "authors": ["DeepSeek-AI"], 385 "year": 2025, 386 "arxiv_id": "2501.12948", 387 "relevance": "One of the evaluated models with reasoning capabilities; relevant to understanding LLM reasoning in code generation tasks." 
388 }, 389 { 390 "title": "A Survey on Code Generation with LLM-based Agents", 391 "authors": ["Yihong Dong", "Xue Jiang", "Jian Qian"], 392 "year": 2025, 393 "arxiv_id": "2508.00083", 394 "relevance": "Survey of LLM-based agent code generation techniques, providing context for agentic approaches to chip design." 395 }, 396 { 397 "title": "VeriGen: A Large Language Model for Verilog Code Generation", 398 "authors": ["Shailja Thakur", "Baleegh Ahmad", "Hammond Pearce", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri", "Siddharth Garg"], 399 "year": 2024, 400 "relevance": "Prior work on LLM-based Verilog generation providing training datasets (50k samples) used with ChipBench toolbox." 401 }, 402 { 403 "title": "QiMeng-CodeV-R1: Reasoning-Enhanced Verilog Generation", 404 "authors": ["Yutong Zhu", "Dong Huang", "Haoran Lyu"], 405 "year": 2025, 406 "relevance": "Provides training dataset (87k samples) used for toolbox evaluation and contains the CodeV-R1 dataset used for reference model generation experiments." 407 }, 408 { 409 "title": "Pro-V: An Efficient Program Generation Multi-Agent System for Automatic RTL Verification", 410 "authors": ["Yiliang Zhao", "Zhifang Wu", "Hejia Zhang", "Zhongkai Yu"], 411 "year": 2025, 412 "arxiv_id": "2506.12200", 413 "relevance": "Multi-agent system for RTL verification including reference model generation as a sub-task; ChipBench argues it benefits from accurate reference models." 414 }, 415 { 416 "title": "Large Language Models for EDA: Future or Mirage?", 417 "authors": ["Zhiyao He", "Yu Pu", "Hao Wu", "Tiancheng Qiu", "Bei Yu"], 418 "year": 2025, 419 "relevance": "Comprehensive survey on LLMs for electronic design automation, providing broader context for the chip design benchmarking landscape." 420 } 421 ] 422 }