scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20181B)
      1 {
      2   "paper": {
      3     "title": "Comprehensive Verilog Design Problems: A Next-Generation Benchmark Dataset for Evaluating Large Language Models and Agents on RTL Design and Verification",
      4     "authors": ["Nathaniel Pinckney", "Chenhui Deng", "Chia-Tung Ho", "Yun-Da Tsai", "Mingjie Liu", "Wenfei Zhou", "Brucek Khailany", "Haoxing Ren"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2506.14074"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive link is provided in the paper. The paper describes infrastructure but does not release it."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset download link is provided. The benchmark is described but no URL for accessing the 783 problems is given."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper specifies Docker containers for evaluation, names specific open-source tools (Icarus Verilog, Yosys, Verilator) with versions, and mentions Cadence Xcelium for commercial tasks."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The evaluation flow is described at a high level (Figure 1) but not with actionable commands."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Tables 2-4 report only point estimates (e.g., '33.56%') with no confidence intervals or error bars despite using n=5 samples."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares models (e.g., Claude 3.7 Sonnet vs GPT 4.1) without any statistical significance tests. Differences are stated as raw numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes reported. Differences between models are stated as raw percentage points (e.g., '8% lower') without formal effect size measures."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "n=5 samples per problem is stated but not justified. No power analysis or rationale for why 5 samples is sufficient for reliable pass@1 estimation."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviation, variance, or spread measures reported across the n=5 samples. Only point estimates of pass@1 are given."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple models are compared (Claude 3.7 Sonnet, GPT 4.1, GPT o1, o4-mini, Llama 3.1 405B/70B). Prior benchmark results (VerilogEval v2) are referenced for context."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Models evaluated include very recent releases: GPT 4.1 (April 2025), o4-mini (April 2025), Claude 3.7 Sonnet (2025). These represent state-of-the-art."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study on benchmark design choices. The effect of quality filtering on pass rates is reported, but filtering is not systematically ablated as a design decision."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Pass@1 for code generation, BLEU for correspondence tasks, and LLM-based judging for Q&A tasks. Multiple metrics across different task types."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of model outputs. Code generation is evaluated by automated test harnesses, and comprehension is scored by BLEU and LLM-based judging (GPT o4-mini)."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a benchmark paper; the entire dataset serves as a test set for evaluating models. There is no training/tuning involved."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Tables 2-4 provide per-category breakdowns across all 13 task categories (cid02-cid16). Table 5 provides detailed per-category failure analysis."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5 provides systematic failure analysis with clustering of failure types, visualizations (Figures 2-3), and specific failure examples in Appendix A."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that models struggle significantly on verification tasks (e.g., 0% pass@1 for Claude 3.7 Sonnet on agentic cid13), and that extended thinking mode provides minimal benefit."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims ('no more than 34% pass@1', 'agentic tasks are particularly difficult') are supported by Tables 2-3 showing Claude 3.7 Sonnet at 33.56% and lower agentic scores."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper suggests causal explanations (e.g., 'This discrepancy may stem from the more procedural and imperative nature of testbench code') without controlled experiments to verify."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The Limitations section (Section 6) explicitly bounds scope: 'limited to standard hardware design and verification tasks', acknowledges Q&A tasks don't sufficiently challenge LLMs, and notes agentic context is oracle context."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not substantively discuss alternative explanations for the performance gaps. For instance, whether low verification pass rates are due to benchmark difficulty, prompt quality, or model limitations is not explored."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are listed as 'Claude 3.7 Sonnet', 'GPT 4.1', 'GPT o1', etc. without specific API versions or snapshot dates. Only Llama models have specific parameter counts."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No actual prompt text is provided. The paper describes prompt structure at a high level but does not include the prompts used for evaluation."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4 states: 'For Llama 3.1 405B and 70B, we set the decoding parameters to T=0.2 and top-p=0.7. For the other models we used the default temperature and top-p supported by the API endpoint.'"
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper evaluates agentic tasks in single-turn (non-agentic) format, noting 'no open-source, general-purpose hardware design agent currently exists.' No agentic scaffolding was used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2 describes the quality filtering pipeline: problems authored by experienced engineers, quality control via LLM judge scoring, automated filtering removing low-quality examples. Counts before/after filtering are given."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 'Limitations' provides a dedicated discussion of benchmark limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6 discusses specific threats: agentic context is oracle context (not realistic), Q&A tasks don't sufficiently challenge LLMs, tasks limited to standard hardware design, and commercial tool requirements for some categories."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6: 'the tasks in the benchmark are limited to standard hardware design and verification tasks and do not encompass the full range of challenges a design or verification engineer might face from project inception through fabrication.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (individual model outputs, per-problem results) is released. Only aggregate pass rates are reported in tables."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2 describes how problems were authored by experienced hardware engineers across 13 categories, with difficulty levels and topical diversity requirements."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The acknowledgments thank 'Turing for their collaboration in developing the benchmark dataset' but do not describe how the hardware engineers who authored problems were recruited or selected."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from problem authoring through quality filtering is described: problems authored per category, quality control via LLM judge, automated filtering. Table 1 shows problem counts. Pre/post-filtering pass rate differences are quantified."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure. The acknowledgments mention Turing collaboration and Cadence tool licenses but no explicit funding source."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed as NVIDIA employees with NVIDIA email addresses."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "NVIDIA, as the employer of all authors, has a commercial interest in hardware design automation. Their benchmark could influence perception of AI capabilities in their domain. No independence statement is made."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "Only GPT 4.1's knowledge cutoff (June 2024) is mentioned via the reference URL. No training cutoffs are stated for other models, and no systematic discussion of cutoff dates."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the benchmark problems or similar Verilog code appeared in any model's training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not discuss contamination risk. Contamination is plausibly less likely because the benchmark is new, but the paper never makes or substantiates that argument."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study. Problems were authored by engineers but no human subjects research was conducted."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants study requiring IRB approval."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token counts, or latency figures reported despite evaluating 7 models across 783 problems with n=5 samples each."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total compute budget, GPU hours, or API spend reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "State-of-the-art models achieve no more than 34% pass@1 on CVDP code generation tasks.",
    286       "evidence": "Table 2 shows Claude 3.7 Sonnet at 33.56% overall pass@1, the highest among all models tested.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CVDP is substantially more challenging than prior Verilog benchmarks like VerilogEval v2.",
    291       "evidence": "Section 4 compares: VerilogEval v2 reported 57% for Llama 3.1 405B and 63% for GPT-4o, while CVDP yields 23% and 29% respectively for comparable models.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Agentic tasks are more difficult than non-agentic tasks when evaluated in single-turn format.",
    296       "evidence": "Tables 2-3 show GPT 4.1 drops from 29% to 21% and Claude 3.7 Sonnet drops from 34% to 29% between non-agentic and agentic problems.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Design verification tasks (testbench/assertion generation) exhibit substantially lower pass rates than RTL generation.",
    301       "evidence": "Table 2 shows cid13 (testbench checker gen) at 3-10% across models vs cid02-04 (RTL gen) at 18-48%.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Extended thinking mode provides minimal benefit for Claude 3.7 Sonnet on these tasks.",
    306       "evidence": "Tables 2-3 show Claude 3.7 Sonnet with and without thinking achieve nearly identical overall pass rates (33.56% vs 33.04% non-agentic, 29% vs 29% agentic).",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The CVDP benchmark introduces 783 expert-authored Verilog design problems across 13 categories and is substantially more challenging than prior benchmarks. State-of-the-art LLMs achieve at most 34% pass@1 on code generation, with design verification tasks (testbench and assertion generation) proving especially difficult: testbench checker generation (cid13) reaches only 3-10% pass@1. The failure analysis reveals that verification tasks produce more diverse failure clusters than RTL generation tasks, suggesting deeper capability gaps in procedural/imperative code generation.",
    312   "red_flags": [
    313     {
    314       "flag": "Company evaluating domain relevance",
    315       "detail": "All 8 authors are NVIDIA employees. NVIDIA has commercial interest in hardware design automation. The benchmark could shape perception of AI readiness for EDA workflows in ways that affect NVIDIA's business strategy."
    316     },
    317     {
    318       "flag": "No statistical uncertainty quantification",
    319       "detail": "Despite using n=5 samples per problem, no confidence intervals, error bars, or variance measures are reported. Differences between models (e.g., 34% vs 29%) may not be statistically significant."
    320     },
    321     {
    322       "flag": "Benchmark and code not publicly released",
    323       "detail": "The paper describes a benchmark but provides no download link, repository URL, or archive. Results cannot be independently verified or reproduced."
    324     },
    325     {
    326       "flag": "LLM-as-judge without validation",
    327       "detail": "GPT o4-mini is used as the scoring model for Q&A tasks, but no validation of this LLM judge against human ratings is provided. The paper acknowledges 'further investigation is needed to assess the technical reliability of these scores.'"
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "VerilogEval: Evaluating large language models for Verilog code generation",
    333       "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"],
    334       "year": 2023,
    335       "relevance": "Prior Verilog code generation benchmark that CVDP extends and supersedes."
    336     },
    337     {
    338       "title": "SWE-bench: Can language models resolve real-world github issues?",
    339       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    340       "year": 2024,
    341       "relevance": "Influential software engineering benchmark for LLM agents; CVDP is analogous for hardware design."
    342     },
    343     {
    344       "title": "CraftRTL: High-quality synthetic data generation for verilog code models with correct-by-construction non-textual representations and targeted code repair",
    345       "authors": ["Mingjie Liu", "Yun-Da Tsai", "Wenfei Zhou", "Haoxing Ren"],
    346       "year": 2025,
    347       "relevance": "Synthetic data generation for Verilog LLMs, directly related to improving model capability on hardware code generation."
    348     },
    349     {
    350       "title": "VerilogCoder: Autonomous verilog coding agents with graph-based planning and abstract syntax tree (AST)-based waveform tracing tool",
    351       "authors": ["Chia-Tung Ho", "Haoxing Ren", "Brucek Khailany"],
    352       "year": 2024,
    353       "arxiv_id": "2408.08927",
    354       "relevance": "Agentic Verilog coding system directly relevant to the agentic evaluation format in CVDP."
    355     },
    356     {
    357       "title": "Copilot evaluation harness: Evaluating LLM-guided software programming",
    358       "authors": ["Anisha Agarwal", "Aaron Chan", "Shubham Chandel"],
    359       "year": 2024,
    360       "arxiv_id": "2402.14261",
    361       "relevance": "LLM evaluation framework for code generation, methodologically related to CVDP's evaluation infrastructure."
    362     },
    363     {
    364       "title": "RTL-Repo: A benchmark for evaluating LLMs on large-scale RTL design projects",
    365       "authors": ["Ahmed Allam", "Mohamed Shalan"],
    366       "year": 2024,
    367       "relevance": "Alternative RTL benchmark focusing on large-scale design projects, complementary to CVDP."
    368     },
    369     {
    370       "title": "Large language model for verilog generation with code-structure-guided reinforcement learning",
    371       "authors": ["Ning Wang", "Bingkun Yao", "Jie Zhou", "Xi Wang", "Zhe Jiang", "Nan Guan"],
    372       "year": 2025,
    373       "arxiv_id": "2407.18271",
    374       "relevance": "RL-based approach to improving LLM Verilog generation, directly evaluated on similar tasks."
    375     },
    376     {
    377       "title": "LLM4DV: Using large language models for hardware test stimuli generation",
    378       "authors": ["Zixi Zhang", "Greg Chadwick", "Hugo McNally", "Yiren Zhao", "Robert Mullins"],
    379       "year": 2023,
    380       "arxiv_id": "2310.04535",
    381       "relevance": "LLM-based hardware test generation, directly relevant to CVDP's verification task categories."
    382     }
    383   ]
    384 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs