scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28531B)
      1 {
      2   "paper": {
      3     "title": "Spec2RTL-Agent: Automated Hardware Code Generation from Complex Specifications Using LLM Agent Systems",
      4     "authors": [
      5       "Zhongzhi Yu",
      6       "Mingjie Liu",
      7       "Michael Zimmer",
      8       "Yingyan (Celine) Lin",
      9       "Yong Liu",
     10       "Haoxing Ren"
     11     ],
     12     "year": 2025,
     13     "venue": "2025 IEEE International Conference on LLM-Aided Design (ICLAD)",
     14     "arxiv_id": "2506.13905",
     15     "doi": "10.1109/ICLAD65226.2025.00013"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "case-study"],
     20   "key_findings": "Spec2RTL-Agent is a multi-agent LLM system that generates RTL code from unstructured hardware specification documents via iterative understanding, progressive coding through multiple abstraction levels (pseudocode→Python→C++→HLS), and adaptive reflection. Evaluated on 3 NIST FIPS specifications (AES, DSS, HMAC), it achieves functional correctness with ~75% fewer human interventions compared to a human-guided baseline (~4.3 vs ~20 interventions). Ablation studies show each module (understanding, progressive coding, reflection) contributes to reducing interventions and coding iterations.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The benchmark uses publicly available NIST FIPS specification documents (AES FIPS 197, DSS FIPS 186-5, HMAC FIPS 198-1), which are freely accessible."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions GPT-4o and AutoGen framework but provides no environment specification, dependency list, or version information for the software stack."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No reproduction instructions, README, or runnable scripts are provided. The system description is at architecture level only."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results in Tables I-III are point estimates (e.g., 4.33 interventions, 9.11 coding iterations) with no confidence intervals or error bars."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims '75% fewer human interventions' and other comparative differences based solely on comparing raw numbers without any statistical tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports relative improvements with baseline context: '75% fewer human interventions' (from ~20 to 4.33), '51.9% reduction' vs W/o Reflection, and '31.0% reduction' in coding iterations vs Naive Coding (Section V-C)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The benchmark consists of only 3 specification documents. No justification is given for this sample size, nor any acknowledgment that N=3 is insufficient for statistical conclusions."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across runs. It is unclear whether experiments were repeated."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Table I compares against five baselines: Single-Shot, Human (mirroring Chip-Chat approach), W/o Understanding, Naive Coding, and W/o Reflection."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No external contemporary systems are compared against. The baselines are ablations of the authors' own system plus a Single-Shot and Human baseline. Systems discussed in related work (VerilogCoder, ChatCPU, CraftRTL) are not evaluated, even partially."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section V-C provides ablation studies removing each module: W/o Understanding, Naive Coding (removes progressive coding), and W/o Reflection. Table I shows contributions of each component."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three metrics are used: Correct (functional correctness), # Intervention (human interventions required), and # Coding (code generation/revision attempts per sub-function)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation of generated code quality. Evaluation is entirely automated via HLS tool functional verification (pass/fail). No expert assessment of code readability, maintainability, or design quality."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "All 3 specification documents are used for both system development and evaluation. No separation between development/tuning and held-out test sets is described."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table II provides per-document breakdown (AES, DSS, HMAC) showing # Intervention and # Coding for each specification."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Fig. 6 visualizes two representative failure scenarios requiring human intervention: interface mismatch (Scenario 1) and inability to identify errors in previous sub-functions (Scenario 2). Section V-E discusses these cases."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The Single-Shot baseline fails all 3 test cases (0/3 correct). W/o Understanding fails 1/3. The paper reports that human intervention is still required in certain scenarios and discusses system limitations."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims '75% fewer human interventions' which is supported by Table I (~20 to 4.33, approximately 78% reduction). The claim of being the 'first fully automated multi-agent system for RTL generation from unstructured specification documents' is supported by the related work survey."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims are made through ablation studies (e.g., 'removing the iterative understanding module led to >4x increase in interventions'). The ablation design systematically removes single components, which is adequate for these claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Complex Specifications' generally, and the abstract says 'advancing LLM-based RTL code generation toward more realistic application settings.' But results are from only 3 NIST cryptographic specifications — a narrow domain. No bounding to cryptographic/NIST specs is stated."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations are discussed. The paper does not consider whether GPT-4o's pre-existing knowledge of AES/DSS/HMAC algorithms contributes to success, or whether the well-structured nature of NIST specifications makes them unrepresentative of general hardware specs."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper measures # Intervention and # Coding iterations and frames these as measuring 'automation' and 'productivity enhancement.' It does not discuss whether fewer interventions actually translate into improved hardware design productivity, or whether the remaining interventions represent a bottleneck."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Section V-A states 'we use GPT-4o as the core architecture for all agents' but provides no snapshot date or API version. Per schema, marketing names like 'GPT-4o' without a version identifier do not count."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Fig. 5 shows one prompt example (for the Python coder) before and after optimization. However, the full set of prompts for the many agents (Summarization, Decomposer, Description, Verifier, Pseudo Coder, Python Coder, C++ Coder, Code Optimizer, Analysis, Reflection) is not provided."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for the GPT-4o API calls."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The multi-agent scaffolding is described in substantial detail in Section IV with Fig. 3 showing the architecture. Agent roles, interactions, workflow stages, reflection strategies, and error resolution paths are all documented."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section V-A describes preprocessing: 'we extract text using PyPDF and capture screenshots for figures and tables. All extracted data is compiled and fed into the LLM.' Also describes how code updates are handled with asterisk markers and rule-based replacement."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The conclusion (Section VI) contains two brief bullet points mentioning limitations (human intervention is still required, and the system needs ~10 coding iterations per sub-function, which is inefficient), but there is no dedicated limitations section with substantive discussion."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No threats-to-validity discussion. The brief limitations in the conclusion are about future directions (reducing human intervention, improving efficiency), not specific threats to the validity of the current results."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. No acknowledgment that results are limited to cryptographic specifications, that N=3 is insufficient for generalization, or that GPT-4o's prior knowledge of these standards may inflate results."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data (agent logs, generated code, intermediate outputs, intervention records) is made available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The benchmark is clearly described: three FIPS documents (AES FIPS 197, DSS FIPS 186-5, HMAC FIPS 198-1) published by NIST, with specific publication references [40]-[42]."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. The data source is standard NIST specifications, a publicly available benchmark."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "While the system pipeline is described architecturally, the evaluation pipeline is not documented: how interventions were counted, what constituted a 'coding iteration,' how functional correctness was determined, and what specific test cases were used."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgment or disclosure appears in the paper, despite authors being from NVIDIA Research and Cadence (major companies in chip design)."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Zhongzhi Yu, Mingjie Liu, and Haoxing Ren at NVIDIA Research; Michael Zimmer and Yong Liu at Cadence; Yingyan (Celine) Lin at Georgia Institute of Technology."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding disclosed. However, NVIDIA and Cadence have direct commercial interests in hardware design automation tools. Cadence's Stratus HLS tool is used in the system and two authors are Cadence employees. Independence cannot be assessed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement appears in the paper. NVIDIA and Cadence are major chip design companies with potential commercial interest in automating hardware design."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "GPT-4o's training data cutoff is not stated. This is critical since the benchmark specifications (AES from 2001, DSS from 2023, HMAC from 2008) and their implementations are widely available online."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether GPT-4o has seen AES, DSS, or HMAC implementations during training. AES in particular has been implemented millions of times in public code repositories."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The FIPS specifications used as benchmarks have been publicly available for years (AES since 2001, HMAC since 2008, DSS since 2023). Countless implementations exist online. GPT-4o has almost certainly been trained on this material, yet contamination is not discussed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No API costs, token counts, or wall-clock time reported. Section VI mentions '~10 iterations per sub-function, which consumes a nontrivial number of tokens' but provides no quantification."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget stated. The multi-agent system with multiple GPT-4o calls across many iterations would have significant cost, but this is not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds or runs. LLM outputs are stochastic, but no seed sensitivity analysis is performed."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not stated. It is unclear whether results represent single runs or averages."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget reported. The many design choices (agent prompts, number of abstraction levels, iteration limits) appear tuned but no search process is documented."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No explanation of how the system configuration was selected. The choice of pseudocode→Python→C++ progression, agent interaction patterns, and other design decisions are not justified with alternatives tried."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so multiple comparison correction is inapplicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own system and implement all baselines themselves (ablations of their system plus the Single-Shot and Human baselines). There is no acknowledgment of self-comparison bias and no independent evaluation."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Different configurations (full system vs. ablations) likely have very different compute costs, but performance is never reported as a function of compute budget."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether success on 3 NIST cryptographic specifications (which have extremely well-defined algorithms) measures the claimed capability of handling general 'complex specifications.'"
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "The scaffold (multi-agent system) IS the thing being tested. The model (GPT-4o) is constant across configurations. The paper evaluates its scaffolding system as a bundled product."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "Not discussed. The AES (2001), HMAC (2008), and DSS (2023) specifications and their implementations all predate GPT-4o's training, AES and HMAC by well over a decade. The model likely knows these algorithms from training data, meaning it may generate correct code from parametric knowledge rather than specification comprehension."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Not discussed. GPT-4o likely has extensive knowledge of AES/DSS/HMAC implementations from training, which could mean the specification document input is redundant — the model may generate correct implementations regardless of specification quality."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Not discussed. The 3 specifications are all from the same source (NIST FIPS) and same domain (cryptographic primitives), raising questions about independence and representativeness."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method used. A simple test would be giving the model an incorrect or scrambled specification to verify it actually reads and processes the spec rather than relying on prior knowledge."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Spec2RTL-Agent reduces human interventions by approximately 75% compared to the Human baseline approach",
    372       "evidence": "Table I shows ~20 interventions for Human baseline vs 4.33 for Spec2RTL-Agent across 3 FIPS specifications (Section V-B).",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Spec2RTL-Agent is the first fully automated multi-agent system for RTL generation from unstructured specification documents",
    377       "evidence": "Supported by the related work survey (Section II) which categorizes existing approaches as either simplified-input or human-in-the-loop. No prior system is identified that performs end-to-end spec-to-RTL generation.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Removing the iterative understanding and reasoning module leads to more than fourfold increase in human interventions",
    382       "evidence": "Table I: W/o Understanding requires 18.67 interventions vs 4.33 for full system (Section V-C).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Spec2RTL-Agent demonstrates strong resilience to generation noise across all implementation levels",
    387       "evidence": "Table III shows correct 3/3 across all noise injection points (understanding, pseudocode, Python, C++) with modest increases in interventions (4.33→5-6 range).",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "Generated AES implementation achieves comparable latency, throughput, and area to modified open-source AES solutions",
    392       "evidence": "Mentioned briefly in Section V-E ('Performance Evaluation') but no quantitative data is provided in the paper.",
    393       "supported": "weak"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "Extremely small benchmark (N=3)",
    399       "detail": "All conclusions are drawn from only 3 specification documents, all from the same domain (NIST cryptographic standards). This sample size is far too small for any statistical claims, and results cannot meaningfully generalize beyond this narrow domain."
    400     },
    401     {
    402       "flag": "Severe contamination risk",
    403       "detail": "AES (2001), HMAC (2008), and DSS (2023) are among the most widely implemented cryptographic standards. GPT-4o has almost certainly been trained on thousands of implementations. The system's success may reflect the model's parametric knowledge of these algorithms rather than its ability to process specifications. No contamination analysis is performed."
    404     },
    405     {
    406       "flag": "Company evaluating related product without disclosure",
    407       "detail": "Two authors are from Cadence, whose commercial Stratus HLS tool is used as a critical component. Three authors are from NVIDIA Research, which has commercial interests in hardware design automation. No conflicts of interest or funding are disclosed."
    408     },
    409     {
    410       "flag": "No variance or uncertainty quantification",
    411       "detail": "All results are point estimates with no error bars, confidence intervals, or multiple-run statistics. With stochastic LLM outputs and N=3, the reported numbers could vary substantially across runs."
    412     },
    413     {
    414       "flag": "No external baseline comparison",
    415       "detail": "Despite discussing VerilogCoder, ChatCPU, CraftRTL, and other systems in related work, no direct comparison is made against any external system. All baselines are either ablations of the authors' own system or the authors' own Single-Shot and Human setups."
    416     },
    417     {
    418       "flag": "Performance claim without data",
    419       "detail": "The paper claims generated AES code achieves 'comparable latency, throughput, and area' to open-source implementations but provides no quantitative performance data to support this claim."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    425       "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"],
    426       "year": 2023,
    427       "arxiv_id": "2309.07544",
    428       "relevance": "Benchmark for evaluating LLMs on Verilog code generation, one of the key prior works this paper builds upon."
    429     },
    430     {
    431       "title": "GPT4AIGChip: Towards Next-Generation AI Accelerator Design Automation via Large Language Models",
    432       "authors": ["Yonggan Fu", "Yongan Zhang", "Zhongzhi Yu"],
    433       "year": 2023,
    434       "arxiv_id": "2309.10730",
    435       "relevance": "LLM-based approach for AI chip design that automates sub-parts of the hardware design pipeline."
    436     },
    437     {
    438       "title": "CraftRTL: High-quality Synthetic Data Generation for Verilog Code Models with Correct-by-Construction Non-Textual Representations and Targeted Code Repair",
    439       "authors": ["Mingjie Liu"],
    440       "year": 2024,
    441       "arxiv_id": "2409.12993",
    442       "relevance": "Synthetic data generation for Verilog code models, addresses RTL code quality issues."
    443     },
    444     {
    445       "title": "VerilogCoder: Autonomous Verilog Coding Agents with Graph-Based Planning and Abstract Syntax Tree (AST)-Based Waveform Tracing Tool",
    446       "authors": ["Chia-Tung Ho", "Haoxing Ren", "Brucek Khailany"],
    447       "year": 2024,
    448       "arxiv_id": "2408.08927",
    449       "relevance": "Autonomous agent system for Verilog coding using graph-based planning, directly comparable to Spec2RTL-Agent's approach."
    450     },
    451     {
    452       "title": "Chip-Chat: Challenges and Opportunities in Conversational Hardware Design",
    453       "authors": ["Jason Blocklove", "Siddharth Garg", "Ramesh Karri", "Hammond Pearce"],
    454       "year": 2023,
    455       "arxiv_id": "2305.13243",
    456       "relevance": "Explores human-LLM collaboration for hardware design, serves as the 'Human' baseline approach in this paper."
    457     },
    458     {
    459       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    460       "authors": ["Qingyun Wu", "Gagan Bansal"],
    461       "year": 2024,
    462       "relevance": "Multi-agent framework used as the underlying orchestration platform for Spec2RTL-Agent."
    463     },
    464     {
    465       "title": "AgentCoder: Multi-Agent-Based Code Generation with Iterative Testing and Optimisation",
    466       "authors": ["Dong Huang"],
    467       "year": 2023,
    468       "arxiv_id": "2312.13010",
    469       "relevance": "Multi-agent code generation approach with iterative testing, directly relevant to agentic code generation methodology."
    470     },
    471     {
    472       "title": "RTLCoder: Fully Open-Source and Efficient LLM-Assisted RTL Code Generation Technique",
    473       "authors": ["Shang Liu"],
    474       "year": 2024,
    475       "relevance": "State-of-the-art open-source RTL code generation method, relevant baseline for LLM-based hardware code generation."
    476     },
    477     {
    478       "title": "ChipNeMo: Domain-Adapted LLMs for Chip Design",
    479       "authors": ["Mingjie Liu"],
    480       "year": 2023,
    481       "arxiv_id": "2311.00176",
    482       "relevance": "Domain-adapted LLMs specifically for chip design tasks, key prior work on applying LLMs to hardware design."
    483     },
    484     {
    485       "title": "Make Every Move Count: LLM-Based High-Quality RTL Code Generation Using MCTS",
    486       "authors": ["Matthew DeLorenzo"],
    487       "year": 2024,
    488       "arxiv_id": "2402.03289",
    489       "relevance": "Combines Monte Carlo Tree Search with LLMs for RTL code generation, alternative approach to improving hardware code quality."
    490     },
    491     {
    492       "title": "AutoSafeCoder: A Multi-Agent Framework for Securing LLM Code Generation through Static Analysis and Fuzz Testing",
    493       "authors": ["Ana Nunez"],
    494       "year": 2024,
    495       "arxiv_id": "2409.10737",
    496       "relevance": "Multi-agent framework for secure code generation with automated testing, relevant to agent-based code quality assurance."
    497     },
    498     {
    499       "title": "ChatCPU: An Agile CPU Design and Verification Platform with LLM",
    500       "authors": ["Xiao Wang"],
    501       "year": 2024,
    502       "relevance": "Human-LLM collaboration platform for CPU design, one of the human-in-the-loop approaches this paper aims to improve upon."
    503     }
    504   ]
    505 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs