scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22311B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Large Language Model for Verilog Code Generation: Literature Review and the Road Ahead",
      6     "authors": [
      7       "Guang Yang",
      8       "Wei Zheng",
      9       "Dong Liang",
     10       "Peng Hu",
     11       "Yukui Yang",
     12       "Shaohang Peng",
     13       "Zhenghan Li",
     14       "Jiahui Feng",
     15       "Xiao Wei",
     16       "Kexin Sun",
     17       "Deyuan Ma",
     18       "Haotian Cheng",
     19       "Yiheng Shen",
     20       "Xiang Chen",
     21       "Xing Hu",
     22       "Terry Yue Zhuo",
     23       "David Lo"
     24     ],
     25     "year": 2025,
     26     "venue": "arXiv.org",
     27     "arxiv_id": "2512.00020",
     28     "doi": "10.48550/arXiv.2512.00020"
     29   },
     30   "checklist": {
     31     "claims_and_evidence": {
     32       "abstract_claims_supported": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The abstract claims to analyze 102 papers across 4 RQs covering LLMs used, datasets/metrics, optimization techniques, and alignment approaches; all four RQs have dedicated sections with supporting evidence from the corpus.",
     36         "source": "haiku"
     37       },
     38       "causal_claims_justified": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a descriptive systematic review making no causal claims; trend observations (e.g., open-source adoption growing faster) are framed as patterns, not causal mechanisms.",
     42         "source": "haiku"
     43       },
     44       "generalization_bounded": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The survey explicitly scopes to 2020-2025 and Verilog code generation specifically; Table 1 delineates what prior surveys cover vs. this paper's contribution, and claims stay within the 102-paper corpus.",
     48         "source": "haiku"
     49       },
     50       "alternative_explanations_discussed": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Trends such as GPT dominance or the rapid growth of open-source LLM adoption are reported without considering alternative explanations such as publication venue bias or citation inflation effects.",
     54         "source": "haiku"
     55       },
     56       "proxy_outcome_distinction": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 5.2 explicitly distinguishes similarity-based metrics (proxies), execution-based metrics (direct functional correctness), and LLM-as-judge metrics, noting that functional-pass@k is the most direct measure.",
     60         "source": "haiku"
     61       }
     62     },
     63     "limitations_and_scope": {
     64       "limitations_section_present": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 9 'Threats to Validity' is a dedicated section addressing three specific threats: paper search omission, study selection bias, and categorization/analysis bias.",
     68         "source": "haiku"
     69       },
     70       "threats_to_validity_specific": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Threats are specific with mitigations: 'fast-paced LLM research may lead to overlooking emerging work, mitigated through rigorous keyword selection and multi-stage filtering' and 'reduction involved subjective quality assessments, mitigated by strict inclusion criteria and a Likert-scale framework requiring 12/15 threshold.'",
     74         "source": "haiku"
     75       },
     76       "scope_boundaries_stated": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Temporal scope (2020-2025), language focus (Verilog only), and task focus (code generation, not verification or testing without generation) are explicitly stated and distinguished from broader EDA surveys in Table 1.",
     80         "source": "haiku"
     81       }
     82     },
     83     "conflicts_of_interest": {
     84       "funding_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 11 (Acknowledgments) discloses partial funding from the National Natural Science Foundation of China (NSFC, No. 62141208).",
     88         "source": "haiku"
     89       },
     90       "affiliations_disclosed": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "All 17 author affiliations are listed in the paper header: Zhejiang University, Northwestern Polytechnical University, Nantong University, Monash University, and Singapore Management University.",
     94         "source": "haiku"
     95       },
     96       "funder_independent_of_outcome": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "NSFC is a Chinese government science foundation with no financial stake in any of the commercial tools, proprietary models, or industry datasets reviewed in this survey.",
    100         "source": "haiku"
    101       },
    102       "financial_interests_declared": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No competing interests statement is provided; the paper only discloses the NSFC grant without any formal declaration of personal financial interests, patents, or consulting relationships.",
    106         "source": "haiku"
    107       }
    108     },
    109     "scope_and_framing": {
    110       "key_terms_defined": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 formally defines Verilog, HDL, EDA workflow phases, RTL, and specifies LLM-based Verilog generation as a formal mapping function f_θ: (D,I) → V; Section 4 distinguishes Base LLMs from IT LLMs with explicit criteria.",
    114         "source": "haiku"
    115       },
    116       "intended_contribution_clear": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Three explicit contribution bullets in the introduction state: (1) first systematic literature review on LLM-based Verilog generation analyzing 102 papers, (2) comprehensive taxonomy across 4 RQs, and (3) key limitations and roadmap.",
    120         "source": "haiku"
    121       },
    122       "engagement_with_prior_work": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Table 1 explicitly compares with six prior surveys (Jiang et al. 2024, Joel et al. 2024, Chen et al. 2025, Fang et al. 2025, He et al. 2024, Chen et al. 2024) showing specific gaps each fails to address that this survey fills.",
    126         "source": "haiku"
    127       }
    128     }
    129   },
    130   "type_checklist": {
    131     "survey": {
    132       "search_and_selection": {
    133         "search_strategy_reproducible": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Section 3.2 fully describes the QGS strategy with six named databases, two explicit keyword sets (25 total terms), temporal filters (2020-2025), and a three-stage filtering process with snowballing; the strategy could be independently replicated.",
    137           "source": "haiku"
    138         },
    139         "inclusion_exclusion_explicit": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Table 2 explicitly lists 3 inclusion criteria and 6 exclusion criteria (e.g., 'short papers < 5 pages,' 'paper is a literature review or survey,' 'LLMs mentioned only in future work'); paper counts at each stage are reported.",
    143           "source": "haiku"
    144         },
    145         "prisma_or_structured_protocol": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "The paper explicitly follows Kitchenham et al.'s SEGRESS SLR guidelines [59] and the Quasi-Gold Standard (QGS) strategy [133], which are established structured review protocols with planning, conducting, and analysis phases.",
    149           "source": "haiku"
    150         },
    151         "search_terms_provided": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Section 3.2.1 provides all search terms verbatim: 12 keywords for Verilog code generation (e.g., 'Verilog,' 'HDL,' 'RTL,' 'EDA,' 'FPGA') and 13 keywords for LLMs (e.g., 'LLM,' 'GPT,' 'fine-tuning,' 'prompt engineering').",
    155           "source": "haiku"
    156         },
    157         "databases_listed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Section 3.2.2 explicitly names all six databases searched: IEEE Xplore, ACM Digital Library, ScienceDirect, Web of Science, SpringerLink, and arXiv.",
    161           "source": "haiku"
    162         },
    163         "screening_process_documented": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Section 3.3.1 documents paper counts at each stage: Stage 1 (short/duplicate removal) → 5,172; Stage 2 (content filtering) → 687; Stage 3 (full-text) → 124; QAC assessment → 85; snowballing → +15 = 102 final.",
    167           "source": "haiku"
    168         },
    169         "review_scope_justified": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "The 2020 start date is justified by citing the field's inception paper (DAVE, 2020); the Verilog-specific focus is justified by the gap analysis in Table 1 showing no prior survey covers this niche comprehensively.",
    173           "source": "haiku"
    174         }
    175       },
    176       "synthesis_quality": {
    177         "conflicting_findings_acknowledged": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The survey presents a unified narrative of progress across all four RQs; it does not explicitly identify cases where reviewed papers report contradictory results or where different approaches yield conflicting performance conclusions.",
    181           "source": "haiku"
    182         },
    183         "quality_assessment_of_sources": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Table 3 presents a five-criterion Quality Assessment Criteria (QAC) framework evaluated on a 0-3 Likert scale with a minimum threshold of 12/15 points; preprints are assessed separately on non-venue criteria.",
    187           "source": "haiku"
    188         },
    189         "publication_bias_discussed": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The paper does not discuss publication bias; it does not acknowledge that published LLM papers in this domain systematically favor reporting positive benchmark results, which likely skews the field's apparent progress.",
    193           "source": "haiku"
    194         },
    195         "quantitative_synthesis_present": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "Synthesis is purely descriptive with counts and percentages (e.g., 204 open-source usages = 53.3%); there is no formal meta-analysis, effect size aggregation, statistical comparison across approaches, or vote counting with confidence intervals.",
    199           "source": "haiku"
    200         },
    201         "recommendations_supported_by_evidence": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The three-stage roadmap in Section 8.2 derives each recommendation from specific limitations identified in RQ1-RQ4 (e.g., 'small-scale benchmarks <100 samples' → 'construct benchmarks exceeding 1,000 samples with PPA truth').",
    205           "source": "haiku"
    206         }
    207       }
    208     }
    209   },
    210   "claims": [
    211     {
    212       "claim": "LLM-based Verilog generation research grew from 1 paper in 2020 to 66 papers by September 2025, representing exponential growth.",
    213       "evidence": "Figure 4b shows exact annual counts: 1 (2020), 0 (2021), 0 (2022), 6 (2023), 29 (2024), 66 (2025).",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "GPT-series models dominate closed-source LLM adoption with 149 of 179 total mentions (83.2%) across surveyed papers.",
    218       "evidence": "Table 4 provides exact usage counts per LLM family by year; GPT: 149 total vs. Claude: 19, Gemini: 5, Others: 6.",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Open-source LLMs slightly outnumber closed-source in total usage across the surveyed literature (204 vs. 179 instances, 53.3% vs. 46.7%).",
    223       "evidence": "Table 4 aggregates total usages by open vs. closed source across all 102 papers, with Llama (64) and DeepSeek (54) leading open-source.",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "55.9% of domain-specific instruction-tuned LLMs for Verilog release public model weights.",
    228       "evidence": "Table 5 lists 34 IT LLMs, 19 open-weight and 15 closed-weight; 19/34 = 55.9%, with open-weight models linked to HuggingFace/GitHub repositories.",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Most instruct-tuning datasets lack executable testbench supervision, creating a gap between training and evaluation pipelines.",
    233       "evidence": "Tables 6 and 7 show nearly all benchmarks include testbenches (16/18 open), while only a few instruct-tuning datasets (VeriCoder-Origen, VeriReason-Data) include testbench supervision.",
    234       "supported": "strong"
    235     },
    236     {
    237       "claim": "Existing benchmarks are small-scale (<100 samples) and neglect system-level complexity such as pipelined datapaths and SoCs.",
    238       "evidence": "Section 8.1 makes this claim, but Table 6 shows CVDP (783 samples), VeriThoughts (291), and RTLRepo_test (1.17K), making the '<100 samples' generalization partially inaccurate.",
    239       "supported": "weak"
    240     },
    241     {
    242       "claim": "Functional-pass@k has become the dominant metric for evaluating Verilog code generation quality.",
    243       "evidence": "Section 5.2 RQ2 summary explicitly states 'functional-pass@k becoming the dominant indicator of correctness'; trend analysis confirms shift from similarity-based to execution-based metrics.",
    244       "supported": "moderate"
    245     }
    246   ],
    247   "methodology_tags": [
    248     "meta-analysis",
    249     "qualitative"
    250   ],
    251   "key_findings": "This systematic review of 102 papers (2020-2025) finds the LLM-based Verilog code generation field has grown explosively, from one paper in 2020 to 66 by September 2025, with GPT-series dominating closed-source adoption (83.2%) and Llama/DeepSeek leading open-source. Functional-pass@k has emerged as the dominant evaluation metric, displacing text similarity measures, but a critical training-evaluation gap persists because most instruct-tuning datasets lack executable testbench supervision while nearly all evaluation benchmarks include testbenches. The field faces four alignment challenges (security, efficiency/PPA, copyright, hallucinations) that remain largely unsolved, and no current system integrates into industrial EDA workflows, leaving a significant gap between experimental prototypes and production-ready hardware design assistance.",
    252   "red_flags": [
    253     {
    254       "flag": "No publication bias acknowledgment",
    255       "detail": "The survey does not discuss the systematic tendency of LLM papers to report positive results, which likely inflates apparent progress across all reviewed techniques and benchmarks."
    256     },
    257     {
    258       "flag": "No conflicting findings identified",
    259       "detail": "The synthesis presents a unified narrative without identifying where reviewed papers contradict each other on methodology effectiveness, benchmark validity, or performance rankings across approaches."
    260     },
    261     {
    262       "flag": "No quantitative synthesis",
    263       "detail": "Despite reviewing 102 papers with reported performance numbers on shared benchmarks (VerilogEval, RTLLM), the synthesis is purely descriptive; no effect sizes, pooled estimates, or statistical comparisons across approaches are provided."
    264     },
    265     {
    266       "flag": "Benchmark characterization overstated",
    267       "detail": "Section 8.1 claims benchmarks are 'small-scale (<100 samples)' but Table 6 shows CVDP (783), VeriThoughts (291), and RTLRepo_test (1.17K) — the characterization is inaccurate for roughly a third of listed benchmarks."
    268     },
    269     {
    270       "flag": "Financial interests undeclared",
    271       "detail": "No competing interests statement is provided despite authors being affiliated with institutions that may have collaborations with commercial EDA vendors or LLM API providers mentioned in the survey."
    272     }
    273   ],
    274   "cited_papers": [
    275     {
    276       "title": "SEGRESS: Software Engineering Guidelines for REporting Secondary Studies (Kitchenham et al. 2023)",
    277       "relevance": "Methodological foundation for the systematic literature review protocol used in this paper"
    278     },
    279     {
    280       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation (Liu et al. 2023)",
    281       "relevance": "The primary benchmark for Verilog code generation evaluation, most widely used across the reviewed literature"
    282     },
    283     {
    284       "title": "VeriGen: A Large Language Model for Verilog Code Generation (Thakur et al. 2024)",
    285       "relevance": "Foundational early paper establishing benchmarks and fine-tuning baselines for the field"
    286     },
    287     {
    288       "title": "RTLCoder: Fully Open-Source and Efficient LLM-Assisted RTL Code Generation Technique (Liu et al. 2025)",
    289       "relevance": "Major open-source fine-tuning approach representing the state of the art in data-centric supervised training"
    290     },
    291     {
    292       "title": "CraftRTL: High-quality Synthetic Data Generation for Verilog Code Models (Liu et al. 2025, ICLR)",
    293       "relevance": "ICLR 2025 paper on correct-by-construction synthetic data, cited as key advance in training data quality"
    294     },
    295     {
    296       "title": "DAVE: Deriving Automatically Verilog from English (Pearce et al. 2020)",
    297       "relevance": "The first paper in the field, marking the inception of LLM-based Verilog generation research used to justify the 2020 scope boundary"
    298     },
    299     {
    300       "title": "Large Language Models for Software Engineering: A Systematic Literature Review (Hou et al. 2024)",
    301       "relevance": "Parallel SLR on general software engineering that this survey explicitly positions against in Table 1"
    302     },
    303     {
    304       "title": "VerilogCoder: Autonomous Verilog Coding Agents with Graph-based Planning and AST-based Waveform Tracing (Ho et al. 2025, AAAI)",
    305       "relevance": "AAAI 2025 multi-agent system representing the frontier of agentic Verilog generation with EDA-tool feedback"
    306     },
    307     {
    308       "title": "Identifying relevant studies in software engineering — QGS methodology (Zhang et al. 2011)",
    309       "relevance": "Provides the Quasi-Gold Standard search strategy used as the core methodology for study identification"
    310     },
    311     {
    312       "title": "DeepRTL: Bridging Verilog Understanding and Generation with a Unified Representation Model (Liu et al. 2025, ICLR)",
    313       "relevance": "ICLR 2025 knowledge-enhanced tuning approach combining understanding and generation tasks"
    314     }
    315   ],
    316   "engagement_factors": {
    317     "practical_relevance": {
    318       "score": 3,
    319       "justification": "Hardware engineers and AI researchers can directly use the taxonomies of 27 benchmarks, 34 training datasets, and categorized techniques to navigate and advance LLM-based Verilog design."
    320     },
    321     "surprise_contrarian": {
    322       "score": 1,
    323       "justification": "Mostly confirms expected trends (GPT dominance, open-source growth); the training-evaluation gap around testbench supervision is notable but not widely counterintuitive to the field."
    324     },
    325     "fear_safety": {
    326       "score": 1,
    327       "justification": "Section 7.1 discusses hardware trojans and security vulnerabilities in LLM-generated Verilog, but this is a secondary concern rather than the paper's central thesis."
    328     },
    329     "drama_conflict": {
    330       "score": 0,
    331       "justification": "Standard academic survey with no controversy, failed replications, or competing factions highlighted; the synthesis is consistently collaborative in tone."
    332     },
    333     "demo_ability": {
    334       "score": 0,
    335       "justification": "Pure literature survey with no tool, dataset, or interactive demo introduced; practitioners cannot immediately try anything from this paper alone."
    336     },
    337     "brand_recognition": {
    338       "score": 1,
    339       "justification": "David Lo (Singapore Management University) is a well-known software engineering researcher; the paper is an arXiv preprint submitted to ACM Computing Surveys, a prestigious venue, but no major industry lab is a co-author."
    340     }
    341   },
    342   "hn_data": {
    343     "threads": [
    344       {
    345         "hn_id": "33833631",
    346         "title": "Multi-scale gigapixel microscopy using a multi-camera array microscope",
    347         "points": 12,
    348         "comments": 1,
    349         "url": "https://news.ycombinator.com/item?id=33833631"
    350       },
    351       {
    352         "hn_id": "42277470",
    353         "title": "Stable, Fast, Automatic Learning Algorithm for Predictive Coding Networks [pdf]",
    354         "points": 3,
    355         "comments": 0,
    356         "url": "https://news.ycombinator.com/item?id=42277470"
    357       },
    358       {
    359         "hn_id": "45979786",
    360         "title": "Semi-Supervised Preference Optimization with Limited Feedback",
    361         "points": 2,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=45979786"
    364       },
    365       {
    366         "hn_id": "45966047",
    367         "title": "VRScout: Towards Real-Time, Autonomous Testing of Virtual Reality Games",
    368         "points": 2,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=45966047"
    371       },
    372       {
    373         "hn_id": "45917707",
    374         "title": "Probing Knowledge Holes in Unlearned LLMs",
    375         "points": 2,
    376         "comments": 0,
    377         "url": "https://news.ycombinator.com/item?id=45917707"
    378       },
    379       {
    380         "hn_id": "4448824",
    381         "title": "Letting daylight in: review reviewers and maximize transparency in science",
    382         "points": 1,
    383         "comments": 0,
    384         "url": "https://news.ycombinator.com/item?id=4448824"
    385       },
    386       {
    387         "hn_id": "46004769",
    388         "title": "Repetitive vs. Non-repetitive Lidar scanning pattern for roadside perception",
    389         "points": 1,
    390         "comments": 0,
    391         "url": "https://news.ycombinator.com/item?id=46004769"
    392       },
    393       {
    394         "hn_id": "45817907",
    395         "title": "Fix: Externalizing network I/O in serverless computing [pdf]",
    396         "points": 1,
    397         "comments": 0,
    398         "url": "https://news.ycombinator.com/item?id=45817907"
    399       },
    400       {
    401         "hn_id": "4095443",
    402         "title": "Modeling Barn Owls to make ultra precise localization systems",
    403         "points": 1,
    404         "comments": 0,
    405         "url": "https://news.ycombinator.com/item?id=4095443"
    406       },
    407       {
    408         "hn_id": "33984298",
    409         "title": "Paper – Educational Opportunities and Challenges of AI Code Generation",
    410         "points": 1,
    411         "comments": 0,
    412         "url": "https://news.ycombinator.com/item?id=33984298"
    413       }
    414     ],
    415     "top_points": 12,
    416     "total_points": 26,
    417     "total_comments": 1
    418   }
    419 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs