scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27540B)
      1 {
      2   "paper": {
      3     "title": "RTL++: Graph-enhanced LLM for RTL Code Generation",
      4     "authors": [
      5       "Mohammad Akyash",
      6       "Kimia Azar",
      7       "Hadi Kamali"
      8     ],
      9     "year": 2025,
     10     "venue": "2025 IEEE International Conference on LLM-Aided Design (ICLAD)",
     11     "arxiv_id": "2505.13479",
     12     "doi": "10.1109/ICLAD65226.2025.00020"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "RTL++ introduces a graph-augmented fine-tuning approach for LLM-based Verilog code generation, encoding RTL code as textualized control flow and data flow graphs alongside the code itself for instruction generation. On VerilogEval HumanEval, RTL++ (200K training set) achieves 59.9% pass@1, 68.8% pass@5, and 72.1% pass@10, outperforming GPT-4 and most baselines on pass@5/10, though it underperforms CraftRTL on pass@1. The ablation study shows graph-augmented instructions improve performance by ~5% at 5K scale, with claimed 18% improvement at 100K scale.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Footnote 1 links to a HuggingFace datasets URL (huggingface.co/datasets/makyash/RTL-PP) for the dataset/model, but no source code repository (training scripts, preprocessing code, evaluation pipeline) is provided. The conclusion also states 'we also plan to make RTL++ a fully open-source model,' indicating full release is future work."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Footnote 1 states 'Dataset/Model is available at [1]' with a HuggingFace URL (huggingface.co/datasets/makyash/RTL-PP). The evaluation uses public benchmarks VerilogEval and RTLLM."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section IV mentions PyTorch, NVIDIA L4, and LoRA, but no requirements.txt, Dockerfile, or detailed dependency listing with library versions is provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided in the paper."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables III, IV, and V report only point estimates (e.g., 59.9% pass@1) with no confidence intervals, error bars, or ± notation."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims 'RTL++ outperforms state-of-the-art models' by comparing raw percentages across Tables III and IV without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Tables III and IV report absolute pass@k percentages with baselines providing context. For example, CodeLlama-7B at 18.2% pass@1 vs RTL++@200K at 59.9%, and GPT-4 at 43.5% vs RTL++@100K at 54.3%. Fig. 3 shows improvement rate per +1K training samples."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for why the dataset sizes (5K to 200K) were chosen, no power analysis, and no discussion of whether the VerilogEval benchmark size is sufficient for the claims made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures reported for any experimental results. All tables show single-run point estimates."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table IV compares against nine baselines: CodeLlama-7B-Instruct, GPT-4, VeriGen, RTLCoder, BetterV, OriGen, AutoVCoder, CodeV, and CraftRTL."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include recent 2024 work: OriGen, BetterV, AutoVCoder, CodeV, and CraftRTL, all representing current state-of-the-art in RTL code generation."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table V provides an ablation study comparing RTL++ with and without textualized graph representations (TGR) at 5K dataset size, showing 1.3-5.1% improvement with TGR at different temperatures."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Evaluation uses multiple metrics: VerilogEval pass@1, pass@5, pass@10, plus RTLLM 1.1 synthesis rate and functional correctness."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of generated code quality. All evaluation is automated via VerilogEval (pass@k via simulation) and RTLLM (synthesis + functional). Human expert review could have assessed code readability, maintainability, or design quality beyond pass/fail."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "VerilogEval and RTLLM 1.1 are external benchmarks separate from the training data, serving as held-out test sets."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Results are reported only as aggregate pass@k on VerilogEval and synthesis/functional rates on RTLLM. No per-task, per-category, or per-difficulty breakdown is provided."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No error analysis, failure cases, or discussion of where RTL++ breaks down. Only the ALU case study (Section IV-E, Fig 4) shows a positive example."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "Every experiment shows consistent improvement. No mention of approaches tried and abandoned, configurations that failed, or ablations that hurt performance."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims RTL++ 'outperforms state-of-the-art models fine-tuned for RTL generation.' However, Table IV shows CraftRTL beats RTL++@200K on pass@1 (63.1 vs 59.9) and RTLLM functional (52.9 vs 51.7). RTL++ only leads on pass@5 (68.8 vs 67.8) and pass@10 (72.1 vs 69.7). The unqualified 'outperforms' claim is not fully supported."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The main causal claim — that graph-based augmentation improves performance — is supported by the Table V ablation study which isolates the graph representation variable while controlling dataset size and model. This is a controlled single-variable manipulation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'RTL Code Generation' broadly but results are only on Verilog (no VHDL, SystemVerilog). Only CodeLlama-7B is used as the base model. The paper acknowledges DeepSeek could yield 'superior outcomes' (footnote 10) but doesn't bound claims to the tested configuration."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are discussed. The improvement could be partly due to larger/different training data rather than graph representations specifically. The ablation at 5K doesn't rule out confounds at 200K scale."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures pass@k on VerilogEval and synthesis/functional rates on RTLLM, but frames results as 'advancing the capabilities of LLM-assisted RTL code generation' and 'high-quality' code generation. No discussion of the gap between benchmark pass rates and real-world RTL design quality (power, performance, area, maintainability)."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "CodeLlama-7B-Instruct is specified as the base model (specific enough). However, GPT-4 and GPT-4o are used for data generation and refinement without version numbers or snapshot dates. Section IV says 'GPT-4 has been engaged' without specifying gpt-4-0613 or similar."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section III-B describes the GPT refinement prompt in natural language ('guided GPT through several key steps') and Section III-D describes instruction generation prompting, but no actual prompt text is provided anywhere in the paper."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section IV reports: learning rate 2e-4, AdamW optimizer with β1=0.9, β2=0.99, cosine decay schedule, warmup ratio 0.03, batch size 2, 1 epoch, LoRA technique. Table V reports temperature settings (0.6, 0.7)."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. RTL++ is a fine-tuned LLM evaluated on standard benchmarks with direct generation."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section III-A documents data collection (keyword-based search, 100-300 line filtering, star-count ranking, testbench/netlist exclusion). Section III-B documents GPT-based refinement (dependency removal, variable initialization, syntax correction, Yosys synthesis validation)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper has no limitations, threats-to-validity, or discussion section. Structure is: Introduction, Related Work, Proposed Model, Experiments, Conclusion — with no limitations discussion."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity discussed anywhere in the paper."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit scope boundaries stated. The paper does not acknowledge that results are limited to Verilog only, CodeLlama-7B only, or that the VerilogEval HumanEval subset may not represent all RTL design scenarios."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The training dataset is linked on HuggingFace, but raw experimental outputs (generated code samples, per-problem pass/fail results, benchmark logs) are not available for independent verification of the reported numbers."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section III-A describes the collection procedure in detail: keyword-based search from GitHub, Bitbucket, and Opencores; star-count ranking; line-count filtering (100-300 lines); testbench/netlist exclusion; hierarchical module inclusion."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data is collected from public code repositories (GitHub, Bitbucket, Opencores)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline stages are described (collection → refinement → graph generation → instruction generation → fine-tuning) but intermediate counts are missing. The paper does not state how many samples were initially collected, how many were excluded at each filtering stage, or how many failed synthesis validation."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations with University of Central Florida, Department of ECE, are clearly listed with email addresses. They are not evaluating a commercial product they are affiliated with."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence cannot be assessed. The paper mentions ~$84 per 1000 samples for GPT-4 usage, indicating non-trivial costs, but the funding source is unstated."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial disclosure statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff date is stated for CodeLlama-7B-Instruct. The custom training data is described as collected from GitHub, Bitbucket, and Opencores but no temporal cutoff is given."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of potential overlap between the 200K training samples (collected from GitHub) and the VerilogEval benchmark (also derived from GitHub/open-source Verilog). This is a significant concern since both draw from the same ecosystem."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "VerilogEval was published in 2023 and its problems are publicly available. CodeLlama was also trained on public code. No discussion of whether the benchmark solutions or problems appeared in either the base model's training data or RTL++'s custom training set."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. Evaluation is entirely automated via benchmarks."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or token consumption is reported for the fine-tuned model. Only data generation cost (~$84 per 1000 samples for GPT-4) is mentioned."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is mentioned (NVIDIA L4) and GPT-4 data generation cost (~$84/1K samples), but total GPU hours for fine-tuning, total training time, and total project compute cost are not stated."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds. Results appear to be from a single training run per configuration."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they are from single or multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Hyperparameters are reported but no search budget or method is described. No indication of how many configurations were tried before selecting the reported settings."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper does not explain how the final hyperparameter configuration was selected or whether a validation set was used for selection."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple comparisons are made across 9+ models and multiple metrics without any statistical tests or correction procedures."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own system against baselines without acknowledging evaluation bias. Some baselines use numbers from original papers while RTL++ numbers are self-reported."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "RTL++@200K uses 200K training samples vs CraftRTL's 80.1K and RTLCoder's 27K. The performance comparison does not normalize for dataset size or compute. Fig 3 shows data-size vs performance but doesn't compare baselines at matched compute."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether VerilogEval or RTLLM actually measure RTL code generation quality in a meaningful way. The paper uses VerilogEval's HumanEval subset based on a brief citation ([40]) but does not analyze construct validity."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No agentic scaffolding is involved. Models are evaluated via direct code generation on benchmarks."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of temporal leakage. The training data is collected from public repositories without temporal restrictions, and VerilogEval problems have been publicly available since 2023."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of feature leakage or whether the evaluation setup provides information not available in real usage."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No analysis of whether training data (from GitHub, Bitbucket, Opencores) and VerilogEval/RTLLM test problems are independent. Both draw from the open-source Verilog ecosystem, creating a serious overlap risk."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is used. No deduplication against benchmarks, no canary strings, no membership inference testing."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "RTL++ with 200K training samples achieves 59.9% pass@1, 68.8% pass@5, 72.1% pass@10 on VerilogEval HumanEval subset.",
    369       "evidence": "Table III and Table IV report these exact numbers for RTL++@200K on VerilogEval HumanEval.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "RTL++ outperforms GPT-4 on VerilogEval when trained on 100K+ dataset.",
    374       "evidence": "Table III shows RTL++@100K at 54.3% pass@1 vs GPT-4 at 43.5% pass@1. No significance testing.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "RTL++ outperforms state-of-the-art fine-tuned models for RTL generation.",
    379       "evidence": "Table IV shows RTL++@200K beats CraftRTL on pass@5 (68.8 vs 67.8) and pass@10 (72.1 vs 69.7), but loses on pass@1 (59.9 vs 63.1) and RTLLM functional correctness (51.7 vs 52.9). The claim is only partially supported.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Graph-augmented instruction generation improves fine-tuning performance by up to 5% at 5K scale and 18% at 100K scale.",
    384       "evidence": "Table V shows a 5.1-percentage-point improvement at 5K (25.6 → 30.7 pass@10 at temp 0.7). The 18% improvement at 100K is stated in Section IV-D text but not shown in a table.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "Dataset size consistently improves model quality with diminishing marginal returns.",
    389       "evidence": "Table III and Fig. 3 show a clear scaling trend from 5K to 200K training samples, with diminishing improvement rate shown in the secondary axis of Fig. 3.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Abstract overclaims relative to results",
    396       "detail": "The abstract claims RTL++ 'outperforms state-of-the-art models' but Table IV shows CraftRTL beats RTL++@200K on pass@1 (63.1 vs 59.9) and RTLLM functional correctness (52.9 vs 51.7). The claim holds only for pass@5 and pass@10."
    397     },
    398     {
    399       "flag": "No error bars or variance on any results",
    400       "detail": "All results across Tables III, IV, and V are single point estimates with no indication of variance across seeds or runs. Pass@k metrics are inherently noisy and a single run may not be representative."
    401     },
    402     {
    403       "flag": "Training/test data contamination risk",
    404       "detail": "Training data collected from GitHub, Bitbucket, and Opencores. VerilogEval is also derived from open-source Verilog. No overlap analysis or deduplication is performed or discussed."
    405     },
    406     {
    407       "flag": "Unfair compute comparison",
    408       "detail": "RTL++@200K uses 200K training samples while CraftRTL uses 80.1K, RTLCoder uses 27K, and CodeV uses 165K. Performance is not normalized for training data size or compute budget, making comparisons misleading."
    409     },
    410     {
    411       "flag": "Ablation only at smallest scale",
    412       "detail": "The graph augmentation ablation (Table V) is only shown at the 5K dataset scale. The claimed 18% improvement at 100K is mentioned in text without supporting table data, making the key contribution's impact at scale unverifiable."
    413     },
    414     {
    415       "flag": "No limitations section",
    416       "detail": "The paper entirely lacks a limitations or threats-to-validity section despite significant limitations: single base model, Verilog only, no variance reporting, potential data overlap."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    422       "authors": ["S. Thakur"],
    423       "year": 2023,
    424       "arxiv_id": "2308.00708",
    425       "relevance": "Early fine-tuning approach for Verilog code generation from GitHub and textbook data, key baseline."
    426     },
    427     {
    428       "title": "RTLCoder: Outperforming GPT-3.5 in Design RTL Generation with Our Open-Source Dataset and Lightweight Solution",
    429       "authors": ["S. Liu"],
    430       "year": 2024,
    431       "relevance": "GPT-3.5-based synthetic data approach for RTL code generation, competing method with 27K training pairs."
    432     },
    433     {
    434       "title": "OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and Self-Reflection",
    435       "authors": ["F. Cui"],
    436       "year": 2024,
    437       "arxiv_id": "2407.16237",
    438       "relevance": "Code-to-code augmentation and self-reflection for Verilog generation, strong baseline with 222K training samples."
    439     },
    440     {
    441       "title": "CraftRTL: High-quality Synthetic Data Generation for Verilog Code Models with Correct-by-Construction Non-Textual Representations and Targeted Code Repair",
    442       "authors": ["M. Liu"],
    443       "year": 2024,
    444       "arxiv_id": "2409.12993",
    445       "relevance": "Correct-by-construction data approach for RTL LLMs, strongest baseline in the comparison (63.1% pass@1)."
    446     },
    447     {
    448       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    449       "authors": ["M. Liu"],
    450       "year": 2023,
    451       "relevance": "Primary benchmark used for evaluation, standard pass@k metric for Verilog code generation."
    452     },
    453     {
    454       "title": "AutoVCoder: A Systematic Framework for Automated Verilog Code Generation Using LLMs",
    455       "authors": ["M. Gao"],
    456       "year": 2024,
    457       "relevance": "Domain-specific RAG approach for Verilog generation, competing method incorporating retrieval augmentation."
    458     },
    459     {
    460       "title": "CodeV: Empowering LLMs for Verilog Generation through Multi-Level Summarization",
    461       "authors": ["Y. Zhao"],
    462       "year": 2024,
    463       "arxiv_id": "2407.10424",
    464       "relevance": "Multi-level code summarization approach for Verilog, shifts focus from generation to description-code pairs."
    465     },
    466     {
    467       "title": "Code Llama: Open Foundation Models for Code",
    468       "authors": ["B. Rozière"],
    469       "year": 2024,
    470       "arxiv_id": "2308.12950",
    471       "relevance": "Base model used for RTL++ fine-tuning, widely used open-source code LLM."
    472     },
    473     {
    474       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming – The Rise of Code Intelligence",
    475       "authors": ["D. Guo"],
    476       "year": 2024,
    477       "relevance": "Alternative base model for code generation; several baselines in Table IV use DeepSeek-Coder variants."
    478     },
    479     {
    480       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    481       "authors": ["E. J. Hu"],
    482       "year": 2021,
    483       "arxiv_id": "2106.09685",
    484       "relevance": "Parameter-efficient fine-tuning technique used by RTL++ for adapting CodeLlama to RTL generation."
    485     },
    486     {
    487       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    488       "authors": ["Y. Lu"],
    489       "year": 2023,
    490       "arxiv_id": "2308.05345",
    491       "relevance": "Secondary benchmark used for evaluation, measuring synthesis rate and functional correctness of generated RTL."
    492     },
    493     {
    494       "title": "Evaluating Large Language Models Trained on Code",
    495       "authors": ["M. Chen"],
    496       "year": 2021,
    497       "arxiv_id": "2107.03374",
    498       "relevance": "Codex paper establishing pass@k evaluation methodology widely used in code generation research."
    499     },
    500     {
    501       "title": "BetterV: Controlled Verilog Generation with Discriminative Guidance",
    502       "authors": ["Z. Pei"],
    503       "year": 2024,
    504       "arxiv_id": "2402.03375",
    505       "relevance": "Controlled text generation framework for Verilog with PPA optimization, competing approach."
    506     }
    507   ]
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs