scan.json (28422B)
1 { 2 "paper": { 3 "title": "MetRex: A Benchmark for Verilog Code Metric Reasoning Using LLMs", 4 "authors": [ 5 "Manar Abdelatty", 6 "Jingxiao Ma", 7 "Sherief Reda" 8 ], 9 "year": 2025, 10 "venue": "30th Asia and South Pacific Design Automation Conference (ASPDAC '25)", 11 "arxiv_id": "2411.03471", 12 "doi": "10.1145/3658617.3697625" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "MetRex introduces a large-scale benchmark of 25,868 Verilog designs annotated with post-synthesis area, delay, and static power metrics. Supervised fine-tuning with Chain of Thought templates improves LLM estimation accuracy by 25-37% across metrics compared to few-shot prompting. Fine-tuned Llama3-8b achieves 73.2% accuracy within a 20% error margin for area estimation but struggles with complex Level-3 designs. LLMs outperform the regression-based MasterRTL by 17.4% within a 5% error margin while offering 1.7x speedup by eliminating feature extraction preprocessing.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper provides a GitHub link: https://github.com/scale-lab/MetRex (footnote 1, Section 1)." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The MetRex dataset of 25,868 designs is released via the same GitHub repository. The paper states the dataset is publicly available." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions hardware (A6000, A40, H100 GPUs) and 4-bit quantization, but provides no requirements.txt, Dockerfile, or detailed software environment specification with library versions." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included in the paper. 
The paper describes the experimental setup but does not provide commands or scripts for replication." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Tables 3 and 4 are reported as point estimates (e.g., '58.0%') with no confidence intervals or error bars." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims improvements (e.g., 'SFT boosts... by 37.0%') based solely on comparing point estimates. No statistical significance tests are applied." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports percentage improvements with baseline context (e.g., Table 4 shows both the baseline and fine-tuned acc@k values, allowing the reader to assess the magnitude of improvement)." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The test set contains 138 designs derived from VerilogEval. No justification is provided for why this sample size is sufficient, nor is any power analysis discussed." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results are single-run point estimates." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper compares against few-shot prompted (non-finetuned) LLMs (Table 3-4) and the regression-based MasterRTL model (Section 5.4, Fig. 4)." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "MasterRTL (2023) is the state-of-the-art regression-based approach for RTL metric estimation. Mixtral-8x7b and Llama3-8b are recent open-source models." 
78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Table 3 ablates the Chain of Thought (CoT) component, comparing performance with and without CoT prompting. The paper also varies LoRA rank (128 vs 256) in Section 5.4." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses both MRE (Mean Relative Error, Eq. 1) and acc@k at multiple k values (1, 5, 10) and error margins (10%, 20%), applied across three metrics (area, delay, static power)." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "Human evaluation is not relevant here. The ground truth comes from EDA synthesis tools, providing objective numerical targets for area, delay, and power." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The training set (25,868 designs from RTL-Coder, VeriGen, etc.) is distinct from the test set (138 designs derived from VerilogEval benchmark), as stated in Section 5.1 and Table 2." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 2 categorizes the test set by difficulty level (L1, L2, L3), and Fig. 4 shows per-level performance comparisons between LLM and MasterRTL." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 5.4 discusses that the model 'underperforms in level-3 primarily due to the increased reasoning complexity' and has 'higher sensitivity to variations in code design and susceptibility to generate extreme outliers.'" 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The abstract acknowledges SFT 'remains far from achieving optimal results, especially on complex problems.' Section 5.4 reports that MasterRTL outperforms the LLM under relaxed error margins (20%) and on Level-3 designs." 
113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims of SFT improvements (37.0%, 25.3%, 25.7%) are supported by Table 4 (acc@1 deltas averaged across models). The 17.4% improvement over regression and 1.7x speedup are supported by Section 5.4 and Fig. 5." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper claims 'SFT boosts reasoning capabilities' and 'CoT prompting enhanced performance.' These causal claims are supported by controlled comparisons: CoT vs non-CoT (Table 3, same models/data) and SFT vs ICL (Table 4, same evaluation set)." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper scopes its claims to Verilog HDL, post-synthesis metrics, and specific technologies (Skywater 130nm, TSMC 65nm). Section 6 explicitly acknowledges the focus on 'self-contained and relatively small-scale designs, due to the limited fine-tuning context window.'" 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the SFT improvements stem from memorization of gate-level patterns rather than genuine reasoning, or whether the CoT gains are due to format compliance rather than deeper understanding." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper's claims match the granularity of its measurements. It measures MRE and acc@k on post-synthesis area/delay/power against EDA tool ground truth, and frames results in terms of 'metric estimation accuracy' without overclaiming broader capabilities." 
140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper refers to 'Mixtral-8x7b' and 'Llama3-8b' without specifying exact checkpoints, variant (base vs instruct), or snapshot dates. These model families have multiple versions." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": false, 151 "justification": "Figure 2 shows the CoT template structure with a single example (full adder). However, the actual few-shot prompt text (the 10 examples), the instruction format used for SFT, and the system prompt are not fully provided." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "The paper reports temperature (0 for ICL, 0.4 for SFT evaluation), LoRA rank (128), 4-bit quantization, and max sequence length (1048 tokens) in Section 5.3." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "The evaluation does not use agentic scaffolding. The LLM agent mentioned in Section 4.1 is for data cleaning only, not for the main evaluation." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 4.1 documents the cleaning process: removing duplicates, filtering non-synthesizable elements (test benches, gate-level netlists), rectifying errors via an automated LLM agent + synthesis tool loop, and the synthesis flow using Yosys and OpenSTA." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 6 ('Discussion and Future Work') contains substantive discussion of limitations including design complexity constraints, missing switching power analysis, and fixed technology assumptions." 
174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 6 identifies specific limitations: the dataset focuses on 'self-contained and relatively small-scale designs, due to the limited fine-tuning context window,' switching power is excluded because it 'requires propagating the activity factor through the logic gates,' and only fixed technology node and synthesis strategy are used." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 6 explicitly states what was NOT tested: larger/complex designs, switching power, different synthesis strategies, and different technology nodes. The paper frames these as future work." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The MetRex dataset is released via GitHub (https://github.com/scale-lab/MetRex), containing the Verilog designs and their post-synthesis metrics." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.1 describes data collection from six sources (RTL-Coder, VeriGen, ISCAS'89, ISCAS'85, OpenCores, NVLDA), with design counts per source in Table 1 and the synthesis methodology using Yosys and OpenSTA." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants are involved. Data sources are standard public datasets and benchmarks." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "Section 4.1 documents the full pipeline: collection from sources → duplicate removal → filtering non-synthesizable elements → automated error fixing via LLM+compiler loop → synthesis with Yosys → metric extraction via OpenSTA → CoT template generation (Section 4.2). Table 1 shows final design counts per source." 
206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "The acknowledgments section states: 'This work is supported by NSF grant 2350180.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All three authors are from Brown University School of Engineering. They are not evaluating a product from their own company." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "NSF is a government funding agency with no financial stake in the outcome of this research." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper does not state the training data cutoff dates for Mixtral-8x7b or Llama3-8b, which is necessary to assess whether the VerilogEval test set could have been seen during pre-training." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether VerilogEval test designs or similar Verilog code appeared in the pre-training data of Mixtral or Llama3." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "VerilogEval was published in 2023. Both Mixtral and Llama3 could have been trained on data containing VerilogEval designs or their solutions, but this contamination risk is not discussed." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants involved. This is a benchmark evaluation study." 
252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants involved." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants involved." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants involved." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants involved." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants involved." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants involved." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Figure 5 shows runtime comparison between Llama3-MetRex-8b and MasterRTL on an H100 GPU, demonstrating 1.7x speedup. Total runtimes are quantified (e.g., MasterRTL preprocessing at 505.3 seconds, model inference at 8.5 seconds)." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "The paper mentions using a single A40 GPU for fine-tuning and a single A6000 GPU for ICL, but does not report total training time, GPU hours, or compute budget." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No multi-seed experiments are reported. Results appear to be from single training runs." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The paper does not explicitly state how many training runs produced the results. The acc@k metric samples multiple predictions per design but the number of independent experimental runs is not stated." 
306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "LoRA rank values of 128 and 256 are used, but no hyperparameter search budget, method, or total configurations tried are reported." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper uses LoRA rank 128 for main experiments and 256 for one comparison (Section 5.4) without explaining how these values were selected or whether other configurations were tried." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Multiple comparisons are made across models, metrics (area/delay/power), error margins (10%/20%), and k values, but no correction for multiple comparisons is applied." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors compare their fine-tuned models against MasterRTL (a third-party system) but do not discuss the bias of evaluating their own system on their own benchmark." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": true, 330 "justification": "Figure 5 provides a runtime comparison between the LLM approach and MasterRTL, breaking down preprocessing and inference time, allowing comparison at matched runtime budgets." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper does not discuss whether the VerilogEval-derived test set of 138 designs is representative of real-world hardware metric estimation tasks, or whether acc@k with MRE thresholds is the right measure of reasoning capability." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved in the evaluation. The LLMs are directly prompted or fine-tuned without agentic scaffolding." 
341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Not discussed. VerilogEval was published in 2023, and both Mixtral and Llama3 could have seen VerilogEval content during pre-training." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed. The few-shot examples and CoT template could leak structural information about the expected output format, but this is not analyzed." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The training set includes VeriGen (GitHub-scraped Verilog) and the test set is from VerilogEval (also based on Verilog problems). Potential overlap between these sources is not discussed." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection or prevention methods are applied (no deduplication between training and pre-training data, no canary strings, no membership inference)." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "Supervised Fine-Tuning boosts LLM reasoning capabilities on average by 37.0%, 25.3%, and 25.7% on area, delay, and static power respectively.", 369 "evidence": "Table 4 shows acc@1 improvements for both Mixtral and Llama3 models after SFT compared to ICL baselines. 
E.g., Llama3-8b area acc@1 improves from 17.4% to 58.0% (+40.6 percentage points).
No confidence intervals, error bars, or statistical tests are reported, making it impossible to assess whether observed differences are statistically significant." 401 }, 402 { 403 "flag": "No contamination analysis", 404 "detail": "Both Mixtral and Llama3 could have seen VerilogEval designs (published 2023) during pre-training. Additionally, the training data sources (VeriGen from GitHub) could overlap with VerilogEval. No deduplication or contamination analysis is performed." 405 }, 406 { 407 "flag": "Selective comparison with MasterRTL", 408 "detail": "The comparison with MasterRTL in Fig. 4 shows the LLM is better at 5% margin but MasterRTL is better at 20% margin and on complex designs. The abstract and conclusion emphasize the 5% and 17.4% advantage while downplaying the regression model's strengths on harder problems." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Benchmarking large language models for automated verilog rtl code generation", 414 "authors": ["Shailja Thakur", "Baleegh Ahmad", "Zhenxing Fan", "Hammond Pearce", "Benjamin Tan", "Ramesh Karri", "Brendan Dolan-Gavitt", "Siddharth Garg"], 415 "year": 2023, 416 "relevance": "Benchmarks LLMs on Verilog code generation, directly related to evaluating LLM capabilities on hardware design tasks." 417 }, 418 { 419 "title": "Invited paper: Verilogeval: Evaluating large language models for verilog code generation", 420 "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"], 421 "year": 2023, 422 "relevance": "Provides the VerilogEval benchmark used as the test set in this paper; key benchmark for LLM-based Verilog generation evaluation." 
423 }, 424 { 425 "title": "RTLLM: An open-source benchmark for design RTL generation with large language model", 426 "authors": ["Yao Lu", "Shang Liu", "Qijun Zhang", "Zhiyao Xie"], 427 "year": 2024, 428 "relevance": "Another benchmark for evaluating LLMs on RTL code generation, relevant to understanding LLM capabilities in hardware design." 429 }, 430 { 431 "title": "RTLFixer: Automatically fixing RTL syntax errors with large language models", 432 "authors": ["YunDa Tsai", "Mingjie Liu", "Haoxing Ren"], 433 "year": 2023, 434 "arxiv_id": "2311.16543", 435 "relevance": "Uses LLMs for automated RTL bug fixing; the automated cleaning flow in MetRex is inspired by this approach." 436 }, 437 { 438 "title": "Evaluating large language models trained on code", 439 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 440 "year": 2021, 441 "arxiv_id": "2107.03374", 442 "relevance": "Introduces the pass@k metric and Codex evaluation; foundational work for evaluating LLM code generation capabilities." 443 }, 444 { 445 "title": "CruxEval: A benchmark for code reasoning, understanding and execution", 446 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather", "Armando Solar-Lezama", "Gabriel Synnaeve", "Sida I Wang"], 447 "year": 2024, 448 "arxiv_id": "2401.03065", 449 "relevance": "Benchmarks LLM code reasoning and execution capabilities, closely related to the code metric reasoning task addressed here." 450 }, 451 { 452 "title": "ChatEDA: A large language model powered autonomous agent for EDA", 453 "authors": ["Haoyuan Wu", "Zhuolun He", "Xinyun Zhang", "Xufeng Yao", "Su Zheng", "Haisheng Zheng", "Bei Yu"], 454 "year": 2024, 455 "relevance": "Demonstrates LLM-powered autonomous agent for electronic design automation, relevant to agentic AI in hardware design." 
456 }, 457 { 458 "title": "MasterRTL: A pre-synthesis PPA estimation framework for any RTL design", 459 "authors": ["Wenji Fang", "Yao Lu", "Shang Liu", "Qijun Zhang", "Ceyu Xu", "Lisa Wu Wills", "Hongce Zhang", "Zhiyao Xie"], 460 "year": 2023, 461 "relevance": "The primary regression-based baseline used for comparison; state-of-the-art ML approach for RTL metric estimation." 462 }, 463 { 464 "title": "Chain-of-thought prompting elicits reasoning in large language models", 465 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 466 "year": 2022, 467 "relevance": "Foundational work on chain-of-thought prompting that MetRex builds upon for its CoT template design." 468 }, 469 { 470 "title": "RTLCoder: Outperforming GPT-3.5 in design RTL generation with our open-source dataset and lightweight solution", 471 "authors": ["Shang Liu", "Wenji Fang", "Yao Lu", "Qijun Zhang", "Hongce Zhang", "Zhiyao Xie"], 472 "year": 2024, 473 "relevance": "Source of the largest portion of MetRex training data (18,450 LLM-generated Verilog designs); evaluates fine-tuned LLMs for RTL generation." 474 }, 475 { 476 "title": "LoRA: Low-rank adaptation of large language models", 477 "authors": ["Edward J Hu", "Phillip Wallis", "Zeyuan Allen-Zhu"], 478 "year": 2021, 479 "relevance": "The parameter-efficient fine-tuning technique used in all MetRex SFT experiments." 480 }, 481 { 482 "title": "Meta large language model compiler: Foundation models of compiler optimization", 483 "authors": ["Chris Cummins", "Volker Seeker", "Dejan Grubisic", "Baptiste Roziere", "Jonas Gehring", "Gabriel Synnaeve", "Hugh Leather"], 484 "year": 2024, 485 "arxiv_id": "2407.02524", 486 "relevance": "Applies LLMs to compiler optimization, demonstrating LLM capabilities in code reasoning and optimization tasks." 
487 } 488 ], 489 "engagement_factors": { 490 "practical_relevance": { 491 "score": 1, 492 "justification": "Hardware designers could potentially use this for early metric estimation, but accuracy is limited and the approach requires fine-tuning." 493 }, 494 "surprise_contrarian": { 495 "score": 1, 496 "justification": "Novel application of LLMs to Verilog metric estimation, but the finding that LLMs can estimate numerical properties with CoT is not deeply surprising." 497 }, 498 "fear_safety": { 499 "score": 0, 500 "justification": "No safety or security implications; purely a hardware design productivity tool." 501 }, 502 "drama_conflict": { 503 "score": 0, 504 "justification": "No controversy; straightforward benchmark introduction paper." 505 }, 506 "demo_ability": { 507 "score": 2, 508 "justification": "GitHub repository with dataset and code is available; a researcher with GPU access could replicate the fine-tuning experiments." 509 }, 510 "brand_recognition": { 511 "score": 0, 512 "justification": "Brown University is respected but not a high-profile AI lab; no major brand products involved." 513 } 514 } 515 }