ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28422B)


      1 {
      2   "paper": {
      3     "title": "MetRex: A Benchmark for Verilog Code Metric Reasoning Using LLMs",
      4     "authors": [
      5       "Manar Abdelatty",
      6       "Jingxiao Ma",
      7       "Sherief Reda"
      8     ],
      9     "year": 2025,
     10     "venue": "30th Asia and South Pacific Design Automation Conference (ASPDAC '25)",
     11     "arxiv_id": "2411.03471",
     12     "doi": "10.1145/3658617.3697625"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "MetRex introduces a large-scale benchmark of 25,868 Verilog designs annotated with post-synthesis area, delay, and static power metrics. Supervised fine-tuning with Chain of Thought templates improves LLM estimation accuracy by 25-37% across metrics compared to few-shot prompting. Fine-tuned Llama3-8b achieves 73.2% accuracy within a 20% error margin for area estimation but struggles with complex Level-3 designs. LLMs outperform the regression-based MasterRTL by 17.4% within a 5% error margin while offering 1.7x speedup by eliminating feature extraction preprocessing.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides a GitHub link: https://github.com/scale-lab/MetRex (footnote 1, Section 1)."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The MetRex dataset of 25,868 designs is released via the same GitHub repository. The paper states the dataset is publicly available."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions hardware (A6000, A40, H100 GPUs) and 4-bit quantization, but provides no requirements.txt, Dockerfile, or detailed software environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are included in the paper. The paper describes the experimental setup but does not provide commands or scripts for replication."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 3 and 4 are reported as point estimates (e.g., '58.0%') with no confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims improvements (e.g., 'SFT boosts... by 37.0%') based solely on comparing point estimates. No statistical significance tests are applied."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports percentage improvements with baseline context (e.g., Table 4 shows both the baseline and fine-tuned acc@k values, allowing the reader to assess the magnitude of improvement)."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The test set contains 138 designs derived from VerilogEval. No justification is provided for why this sample size is sufficient, nor is any power analysis discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results are single-run point estimates."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against few-shot prompted (non-finetuned) LLMs (Table 3-4) and the regression-based MasterRTL model (Section 5.4, Fig. 4)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "MasterRTL (2023) is the state-of-the-art regression-based approach for RTL metric estimation. Mixtral-8x7b and Llama3-8b are recent open-source models."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 3 ablates the Chain of Thought (CoT) component, comparing performance with and without CoT prompting. The paper also varies LoRA rank (128 vs 256) in Section 5.4."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses both MRE (Mean Relative Error, Eq. 1) and acc@k at multiple k values (1, 5, 10) and error margins (10%, 20%), applied across three metrics (area, delay, static power)."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation is not relevant here. The ground truth comes from EDA synthesis tools, providing objective numerical targets for area, delay, and power."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The training set (25,868 designs from RTL-Coder, VeriGen, etc.) is distinct from the test set (138 designs derived from VerilogEval benchmark), as stated in Section 5.1 and Table 2."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 2 categorizes the test set by difficulty level (L1, L2, L3), and Fig. 4 shows per-level performance comparisons between LLM and MasterRTL."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.4 discusses that the model 'underperforms in level-3 primarily due to the increased reasoning complexity' and has 'higher sensitivity to variations in code design and susceptibility to generate extreme outliers.'"
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract acknowledges SFT 'remains far from achieving optimal results, especially on complex problems.' Section 5.4 reports that MasterRTL outperforms the LLM under relaxed error margins (20%) and on Level-3 designs."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims of SFT improvements (37.0%, 25.3%, 25.7%) are supported by Table 4 (acc@1 deltas averaged across models). The 17.4% improvement over regression and 1.7x speedup are supported by Section 5.4 and Fig. 5."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper claims 'SFT boosts reasoning capabilities' and 'CoT prompting enhanced performance.' These causal claims are supported by controlled comparisons: CoT vs non-CoT (Table 3, same models/data) and SFT vs ICL (Table 4, same evaluation set)."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper scopes its claims to Verilog HDL, post-synthesis metrics, and specific technologies (Skywater 130nm, TSMC 65nm). Section 6 explicitly acknowledges the focus on 'self-contained and relatively small-scale designs, due to the limited fine-tuning context window.'"
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the SFT improvements stem from memorization of gate-level patterns rather than genuine reasoning, or whether the CoT gains are due to format compliance rather than deeper understanding."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper's claims match the granularity of its measurements. It measures MRE and acc@k on post-synthesis area/delay/power against EDA tool ground truth, and frames results in terms of 'metric estimation accuracy' without overclaiming broader capabilities."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper refers to 'Mixtral-8x7b' and 'Llama3-8b' without specifying exact checkpoints, variant (base vs instruct), or snapshot dates. These model families have multiple versions."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Figure 2 shows the CoT template structure with a single example (full adder). However, the actual few-shot prompt text (the 10 examples), the instruction format used for SFT, and the system prompt are not fully provided."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The paper reports temperature (0 for ICL, 0.4 for SFT evaluation), LoRA rank (128), 4-bit quantization, and max sequence length (1048 tokens) in Section 5.3."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "The evaluation does not use agentic scaffolding. The LLM agent mentioned in Section 4.1 is for data cleaning only, not for the main evaluation."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.1 documents the cleaning process: removing duplicates, filtering non-synthesizable elements (test benches, gate-level netlists), rectifying errors via an automated LLM agent + synthesis tool loop, and the synthesis flow using Yosys and OpenSTA."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 ('Discussion and Future Work') contains substantive discussion of limitations including design complexity constraints, missing switching power analysis, and fixed technology assumptions."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 identifies specific limitations: the dataset focuses on 'self-contained and relatively small-scale designs, due to the limited fine-tuning context window,' switching power is excluded because it 'requires propagating the activity factor through the logic gates,' and only fixed technology node and synthesis strategy are used."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6 explicitly states what was NOT tested: larger/complex designs, switching power, different synthesis strategies, and different technology nodes. The paper frames these as future work."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The MetRex dataset is released via GitHub (https://github.com/scale-lab/MetRex), containing the Verilog designs and their post-synthesis metrics."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.1 describes data collection from six sources (RTL-Coder, VeriGen, ISCAS'89, ISCAS'85, OpenCores, NVDLA), with design counts per source in Table 1 and the synthesis methodology using Yosys and OpenSTA."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants are involved. Data sources are standard public datasets and benchmarks."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 4.1 documents the full pipeline: collection from sources → duplicate removal → filtering non-synthesizable elements → automated error fixing via LLM+compiler loop → synthesis with Yosys → metric extraction via OpenSTA → CoT template generation (Section 4.2). Table 1 shows final design counts per source."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The acknowledgments section states: 'This work is supported by NSF grant 2350180.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All three authors are from Brown University School of Engineering. They are not evaluating a product from their own company."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "NSF is a government funding agency with no financial stake in the outcome of this research."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper does not state the training data cutoff dates for Mixtral-8x7b or Llama3-8b, which is necessary to assess whether the VerilogEval test set could have been seen during pre-training."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether VerilogEval test designs or similar Verilog code appeared in the pre-training data of Mixtral or Llama3."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "VerilogEval was published in 2023. Both Mixtral and Llama3 could have been trained on data containing VerilogEval designs or their solutions, but this contamination risk is not discussed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants involved. This is a benchmark evaluation study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants involved."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants involved."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants involved."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants involved."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants involved."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants involved."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Figure 5 shows runtime comparison between Llama3-MetRex-8b and MasterRTL on an H100 GPU, demonstrating 1.7x speedup. Total runtimes are quantified (e.g., MasterRTL preprocessing at 505.3 seconds, model inference at 8.5 seconds)."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper mentions using a single A40 GPU for fine-tuning and a single A6000 GPU for ICL, but does not report total training time, GPU hours, or compute budget."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No multi-seed experiments are reported. Results appear to be from single training runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper does not explicitly state how many training runs produced the results. The acc@k metric samples multiple predictions per design but the number of independent experimental runs is not stated."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "LoRA rank values of 128 and 256 are used, but no hyperparameter search budget, method, or total configurations tried are reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper uses LoRA rank 128 for main experiments and 256 for one comparison (Section 5.4) without explaining how these values were selected or whether other configurations were tried."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple comparisons are made across models, metrics (area/delay/power), error margins (10%/20%), and k values, but no correction for multiple comparisons is applied."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors compare their fine-tuned models against MasterRTL (a third-party system) but do not discuss the bias of evaluating their own system on their own benchmark."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Figure 5 provides a runtime comparison between the LLM approach and MasterRTL, breaking down preprocessing and inference time, allowing comparison at matched runtime budgets."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether the VerilogEval-derived test set of 138 designs is representative of real-world hardware metric estimation tasks, or whether acc@k with MRE thresholds is the right measure of reasoning capability."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved in the evaluation. The LLMs are directly prompted or fine-tuned without agentic scaffolding."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Not discussed. VerilogEval was published in 2023, and both Mixtral and Llama3 could have seen VerilogEval content during pre-training."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. The few-shot examples and CoT template could leak structural information about the expected output format, but this is not analyzed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The training set includes VeriGen (GitHub-scraped Verilog) and the test set is from VerilogEval (also based on Verilog problems). Potential overlap between these sources is not discussed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention methods are applied (no deduplication of the test set against training or pre-training data, no canary strings, no membership inference)."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Supervised Fine-Tuning boosts LLM reasoning capabilities on average by 37.0%, 25.3%, and 25.7% on area, delay, and static power respectively.",
    369       "evidence": "Table 4 shows acc@1 improvements for both Mixtral and Llama3 models after SFT compared to ICL baselines. E.g., Llama3-8b area acc@1 improves from 17.4% to 58.0% (+40.6 percentage points).",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Chain of Thought prompting enhances ICL performance on average by 5.1%, 5.4%, and 8.9% on area, delay, and static power.",
    374       "evidence": "Table 3 shows acc@5 improvements when using CoT vs direct prompting for both Mixtral and Llama3 models under 10% and 20% error margins.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "LLMs improve the rate of obtaining accurate estimates within a 5% error margin by 17.4% compared to MasterRTL.",
    379       "evidence": "Section 5.4 and Fig. 4 show per-level comparisons between Llama3-MetRex-8b and MasterRTL at 5%, 10%, and 20% error margins.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "LLMs offer a 1.7x speedup by eliminating the need for preprocessing.",
    384       "evidence": "Figure 5 shows runtime comparison on H100 GPU. MasterRTL total time is dominated by SOG generation and feature extraction (505.3s), while LLM inference is faster overall.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Fine-tuned Llama3-MetRex-8b achieves accuracy rates of 73.2%, 61.6%, and 52.2% for area, delay, and static power within a 20% error margin.",
    389       "evidence": "Table 4 shows these acc@1 values for the fine-tuned Llama3-MetRex-8b model at the 20% MRE threshold.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Training data dominated by LLM-generated code",
    396       "detail": "71.3% of the training dataset (18,450 of 25,868 designs) comes from RTL-Coder, which contains GPT-generated Verilog. This could bias the model toward patterns in LLM-generated code rather than human-written designs, potentially inflating metrics on the test set which also includes programmatically-derived designs."
    397     },
    398     {
    399       "flag": "Small test set without uncertainty quantification",
    400       "detail": "The test set contains only 138 designs. Per-level subsets are even smaller (L1: 23, L2: 43, L3: 72). No confidence intervals, error bars, or statistical tests are reported, making it impossible to assess whether observed differences are statistically significant."
    401     },
    402     {
    403       "flag": "No contamination analysis",
    404       "detail": "Both Mixtral and Llama3 could have seen VerilogEval designs (published 2023) during pre-training. Additionally, the training data sources (VeriGen from GitHub) could overlap with VerilogEval. No deduplication or contamination analysis is performed."
    405     },
    406     {
    407       "flag": "Selective comparison with MasterRTL",
    408       "detail": "The comparison with MasterRTL in Fig. 4 shows the LLM is better at 5% margin but MasterRTL is better at 20% margin and on complex designs. The abstract and conclusion emphasize the 17.4% advantage at the 5% error margin while downplaying the regression model's strengths on harder problems."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Benchmarking large language models for automated verilog rtl code generation",
    414       "authors": ["Shailja Thakur", "Baleegh Ahmad", "Zhenxing Fan", "Hammond Pearce", "Benjamin Tan", "Ramesh Karri", "Brendan Dolan-Gavitt", "Siddharth Garg"],
    415       "year": 2023,
    416       "relevance": "Benchmarks LLMs on Verilog code generation, directly related to evaluating LLM capabilities on hardware design tasks."
    417     },
    418     {
    419       "title": "Invited paper: Verilogeval: Evaluating large language models for verilog code generation",
    420       "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"],
    421       "year": 2023,
    422       "relevance": "Provides the VerilogEval benchmark used as the test set in this paper; key benchmark for LLM-based Verilog generation evaluation."
    423     },
    424     {
    425       "title": "RTLLM: An open-source benchmark for design RTL generation with large language model",
    426       "authors": ["Yao Lu", "Shang Liu", "Qijun Zhang", "Zhiyao Xie"],
    427       "year": 2024,
    428       "relevance": "Another benchmark for evaluating LLMs on RTL code generation, relevant to understanding LLM capabilities in hardware design."
    429     },
    430     {
    431       "title": "RTLFixer: Automatically fixing RTL syntax errors with large language models",
    432       "authors": ["YunDa Tsai", "Mingjie Liu", "Haoxing Ren"],
    433       "year": 2023,
    434       "arxiv_id": "2311.16543",
    435       "relevance": "Uses LLMs for automated RTL bug fixing; the automated cleaning flow in MetRex is inspired by this approach."
    436     },
    437     {
    438       "title": "Evaluating large language models trained on code",
    439       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    440       "year": 2021,
    441       "arxiv_id": "2107.03374",
    442       "relevance": "Introduces the pass@k metric and Codex evaluation; foundational work for evaluating LLM code generation capabilities."
    443     },
    444     {
    445       "title": "CruxEval: A benchmark for code reasoning, understanding and execution",
    446       "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather", "Armando Solar-Lezama", "Gabriel Synnaeve", "Sida I Wang"],
    447       "year": 2024,
    448       "arxiv_id": "2401.03065",
    449       "relevance": "Benchmarks LLM code reasoning and execution capabilities, closely related to the code metric reasoning task addressed here."
    450     },
    451     {
    452       "title": "ChatEDA: A large language model powered autonomous agent for EDA",
    453       "authors": ["Haoyuan Wu", "Zhuolun He", "Xinyun Zhang", "Xufeng Yao", "Su Zheng", "Haisheng Zheng", "Bei Yu"],
    454       "year": 2024,
    455       "relevance": "Demonstrates LLM-powered autonomous agent for electronic design automation, relevant to agentic AI in hardware design."
    456     },
    457     {
    458       "title": "MasterRTL: A pre-synthesis PPA estimation framework for any RTL design",
    459       "authors": ["Wenji Fang", "Yao Lu", "Shang Liu", "Qijun Zhang", "Ceyu Xu", "Lisa Wu Wills", "Hongce Zhang", "Zhiyao Xie"],
    460       "year": 2023,
    461       "relevance": "The primary regression-based baseline used for comparison; state-of-the-art ML approach for RTL metric estimation."
    462     },
    463     {
    464       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    465       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    466       "year": 2022,
    467       "relevance": "Foundational work on chain-of-thought prompting that MetRex builds upon for its CoT template design."
    468     },
    469     {
    470       "title": "RTLCoder: Outperforming GPT-3.5 in design RTL generation with our open-source dataset and lightweight solution",
    471       "authors": ["Shang Liu", "Wenji Fang", "Yao Lu", "Qijun Zhang", "Hongce Zhang", "Zhiyao Xie"],
    472       "year": 2024,
    473       "relevance": "Source of the largest portion of MetRex training data (18,450 LLM-generated Verilog designs); evaluates fine-tuned LLMs for RTL generation."
    474     },
    475     {
    476       "title": "LoRA: Low-rank adaptation of large language models",
    477       "authors": ["Edward J Hu", "Phillip Wallis", "Zeyuan Allen-Zhu"],
    478       "year": 2021,
    479       "relevance": "The parameter-efficient fine-tuning technique used in all MetRex SFT experiments."
    480     },
    481     {
    482       "title": "Meta large language model compiler: Foundation models of compiler optimization",
    483       "authors": ["Chris Cummins", "Volker Seeker", "Dejan Grubisic", "Baptiste Roziere", "Jonas Gehring", "Gabriel Synnaeve", "Hugh Leather"],
    484       "year": 2024,
    485       "arxiv_id": "2407.02524",
    486       "relevance": "Applies LLMs to compiler optimization, demonstrating LLM capabilities in code reasoning and optimization tasks."
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 1,
    492       "justification": "Hardware designers could potentially use this for early metric estimation, but accuracy is limited and the approach requires fine-tuning."
    493     },
    494     "surprise_contrarian": {
    495       "score": 1,
    496       "justification": "Novel application of LLMs to Verilog metric estimation, but the finding that LLMs can estimate numerical properties with CoT is not deeply surprising."
    497     },
    498     "fear_safety": {
    499       "score": 0,
    500       "justification": "No safety or security implications; purely a hardware design productivity tool."
    501     },
    502     "drama_conflict": {
    503       "score": 0,
    504       "justification": "No controversy; straightforward benchmark introduction paper."
    505     },
    506     "demo_ability": {
    507       "score": 2,
    508       "justification": "GitHub repository with dataset and code is available; a researcher with GPU access could replicate the fine-tuning experiments."
    509     },
    510     "brand_recognition": {
    511       "score": 0,
    512       "justification": "Brown University is respected but not a high-profile AI lab; no major brand products involved."
    513     }
    514   }
    515 }

Impressum · Datenschutz