scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31720B)
      1 {
      2   "paper": {
      3     "title": "Exploring Code Language Models for Automated HLS-based Hardware Generation: Benchmark, Infrastructure and Analysis",
      4     "authors": [
      5       "Jiahao Gai",
      6       "Hao (Mark) Chen",
      7       "Zhican Wang",
      8       "Hongyu Zhou",
      9       "Wanru Zhao",
     10       "Nicholas Lane",
     11       "Hongxiang Fan"
     12     ],
     13     "year": 2025,
     14     "venue": "ASP-DAC'25 (Asia and South Pacific Design Automation Conference)",
     15     "arxiv_id": "2502.13921",
     16     "doi": "10.1145/3658617.3697616"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL is provided for the framework, fine-tuned models, or evaluation infrastructure. The paper references open-source tools used (axolotl, Code-Llama) but does not release its own code."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The collected dataset of 42,000+ HLS programs is not released. The paper describes collecting from HLSyn and ML4Accel repositories but does not provide a download link for the curated and filtered dataset or the test set with unit tests."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section 5.1 lists hardware (4x NVIDIA L20 GPUs, 80 vCPU Intel Xeon Platinum 8457C, 100GB RAM) and some training settings (8-bit loading, sequence length 4096), but no requirements.txt, Dockerfile, or complete library version list is provided. Software versions are incomplete (Vivado 2020.1 is mentioned but Python version, PyTorch version, axolotl version, etc. are not)."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The paper describes the framework at a high level but does not include commands, scripts, or a README for replication."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results are reported as single point estimates (e.g., 88.44% syntax, 53.20% functionality in Figure 6). No confidence intervals or error bars appear in any figure or table."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims improvements from finetuning, CoT, and feedback loops based solely on comparing raw percentages. No statistical significance tests (t-tests, bootstrap, etc.) are applied to any comparison."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Results are reported with both baseline and improved values, providing context for the magnitude of effects. For example, Section 5.2: syntax from 54.85% to 88.44%, functionality from 0% to 53.20%. Section 5.3: syntax from 88.44% to 94.33%, functionality from 53.20% to 61.45%."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The benchmark is built from only 52 base designs and split into training and testing sets at a 4:1 ratio (Section 3.2) — roughly 10 test designs if the split is made at the design level; the exact test set size is not stated. No justification is given for why this sample size is sufficient. No power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviations, variance, or spread measures are reported for any experiment. All results appear to be from single runs without repeated trials."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The non-finetuned Code-Llama-7B serves as a baseline (Section 5.2, Figure 6a). Ablations of CoT and feedback loops provide additional internal baselines."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "All comparisons are against ablations of the authors' own system. No external baselines are included — no comparison with VeriGen, VerilogEval methods, RTLFixer, LLM-VeriPPA, or other contemporary LLM-for-hardware approaches discussed in Section 2.2."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Sections 5.2–5.4 systematically ablate three components: supervised fine-tuning (Section 5.2), chain-of-thought prompting (Section 5.3), and feedback loops with varying iterations (Section 5.4)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two metrics are used throughout: pass@3 syntax correctness and pass@3 functionality correctness. Table 1 also reports latency and resource usage (LUTs, registers, DSP48s, BRAMs) for synthesized designs."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of generated code quality is performed. All evaluation is automated through GCC syntax checking and unit test comparison."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 3.2 states the 52 designs are 'split into training and testing sets at a 4:1 ratio.' Results are reported on this test split."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 2 (Section 5.6) breaks down performance by complexity level (easy, medium, difficult). Table 3 (Section 5.7) compares MachineGen vs HumanRefine. Section 3.2 categorizes designs into five types."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper discusses performance trends (e.g., decline with complexity, HumanRefine gap) but does not show specific examples of failed generations, common error types, or qualitative analysis of what goes wrong."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 3 reports the dramatic HumanRefine performance drop (syntax 47.29%, functionality 21.36% vs MachineGen's 93.83%, 62.24%). Section 5.4 reports diminishing returns for the second feedback loop iteration. Section 5.8 acknowledges benchmark diversity limitations."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims 'Comprehensive experiments demonstrate the effectiveness of our methods' and the results in Sections 5.2–5.4 do show improvements from finetuning, CoT, and feedback loops. The abstract is relatively hedged, noting limitations and future work."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims (e.g., 'finetuning dramatically increases syntax correctness,' 'incorporating CoT leads to a noticeable improvement') are backed by controlled ablation studies that vary one component at a time (Sections 5.2–5.4). The ablation design is adequate for these claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'Automated HLS-based Hardware Generation' broadly, but experiments use only one model (Code-Llama-7B), one fine-tuning method (QLoRA), and 52 base designs from two repositories. Section 5.8 acknowledges 'the diversity of hardware designs in the benchmark is limited' but the title and abstract do not bound claims to the tested setting."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Section 5.7 offers one interpretation for the MachineGen/HumanRefine gap (training data bias, complexity of human prompts) but does not consider alternatives. No discussion of confounds such as whether improvements from CoT are due to longer input sequences, or whether the non-finetuned model's 0% functionality could reflect prompt format mismatch rather than lack of HLS knowledge."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures pass@3 for syntax correctness and functionality correctness, and claims are framed in terms of these specific metrics. No broader claims about 'hardware design quality' are made beyond what is measured, though Section 5.8 notes that hardware performance feedback is left for future work."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper specifies 'Code-Llama-7B' (Section 5.1) and 'ChatGPT (version 3.5 and 4)' (Section 3.3) but provides no specific checkpoint, snapshot date, or API version for any model. Code-Llama-7B has multiple releases, and 'ChatGPT 3.5 and 4' are marketing names without version granularity."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The default instruction prompt is stated as 'Generate HLS code with the following instructions:' (Section 3.1). Figure 5 provides the full CoT prompt text ('Let's think step by step. First, Consider the characteristics of FPGA...'). These are the actual prompts used, not just descriptions."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Section 5.1 reports some settings: 8-bit loading, sequence length 4096, warmup steps 100, gradient accumulation 4, micro-batch size 4, inference batch size 2. However, critical hyperparameters are missing: learning rate, number of epochs, temperature and sampling settings for generation (top-p, top-k)."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The iterative framework is described in detail: Figure 4 provides a full overview, Section 4.1 describes the two-stage pipeline, Section 4.2 details the CoT mechanism, and Section 4.3 describes the two-step feedback loop (syntax check → feedback → functionality check → feedback) with iteration control."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "Section 3.2 states designs were collected from HLSyn and ML4Accel, then 'We filter out the HLS programs that are invalid, resulting in a collection of over 42,000 HLS programs.' The filtering criteria for 'invalid' programs are not specified, and the count before filtering is not stated."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5.8 'Thoughts, Insights, and Limitations' contains a dedicated subsection (item 3) discussing limitations including unavailability of advanced reasoning models, limited benchmark diversity, and missing test-time scaling approaches."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5.8 item 3 discusses specific threats: DeepSeek-R1 was not available for evaluation, test-time scaling could improve results, and 'the diversity of hardware designs in the benchmark is limited, which may impact the generalizability of our findings.' These are specific to this study."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper explicitly states it focuses on C-based HLS (footnote in Section 2), acknowledges the benchmark's limited diversity may impact generalizability (Section 5.8), and identifies hardware performance feedback as outside current scope. The abstract notes limitations due to 'the timeframe of this research.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The collected HLS dataset, the generated design descriptions, the fine-tuned model weights, and the test outputs are all unavailable for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.2 describes the data sources (HLSyn, ML4Accel open-source repositories), the five design categories (matrix operations, scientific simulations, statistical computations, iterative methods, other kernels), the filtering of invalid programs, and the 4:1 train/test split."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data comes from identified open-source repositories (HLSyn, ML4Accel)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline goes from 52 base designs → different pragma combinations → filtering invalid programs → 42,000+ programs → 4:1 split. However, the number of programs before filtering is not stated, the criteria for 'invalid' are unspecified, and the exact test set size is not given."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source, acknowledgments section, or grant information appears anywhere in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: Imperial College London, University of Cambridge, Shanghai Jiao Tong University, University of Sydney. All are academic institutions with no apparent product conflict."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, making it impossible to assess funder independence."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper fine-tunes and evaluates Code-Llama-7B on an HLS benchmark but never states the model's pre-training data cutoff date. This is relevant because the HLS designs come from public GitHub repositories that may be in the pre-training corpus."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether the HLS designs from HLSyn and ML4Accel (both on GitHub) could have appeared in Code-Llama's pre-training data. The train/test split is described but potential overlap with pre-training data is not addressed."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The benchmark designs are sourced from public GitHub repositories (HLSyn, ML4Accel) that existed before Code-Llama's training. Code-Llama was trained on GitHub data, creating a clear contamination risk that is never discussed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Figure 9 reports inference time averaged over 120 data points under different conditions (w/o feedback, syntax loop, functionality loop, each with and without CoT). Time cost ranges from 5–11 seconds per data point."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Section 5.1 describes the hardware setup (4x NVIDIA L20 GPUs, 80 vCPU, 100GB RAM) but does not state total GPU hours for fine-tuning, total training time, or overall computational budget."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single training run and single evaluation."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper uses pass@3 (3 generated samples per problem) but does not state how many independent runs of the entire experiment were conducted. It is unclear whether results are from a single run or averaged across multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 5.1 lists fixed hyperparameter values but does not describe any hyperparameter search process, number of configurations tried, or how the reported values were selected."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper reports results for specific configurations without explaining how they were selected. No validation-based selection process is described."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed at all, so the question of correcting for multiple comparisons is structurally inapplicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "All comparisons are between the authors' own system variants. No external baselines are included and no discussion of author-evaluation bias is present."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Figure 9 shows time cost for different configurations, but performance is not plotted as a function of compute budget. The time and accuracy results are presented separately, without matched-compute analysis."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether pass@3 syntax correctness and unit-test-based functionality adequately measure the ability to generate useful hardware designs. No discussion of construct validity or comparison with alternative evaluation approaches."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "The framework (scaffold) IS the thing being evaluated and ablated. The paper tests one model (Code-Llama-7B) within its own scaffold, varying scaffold components as independent variables. No cross-model-cross-scaffold comparison is made."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The HLS designs sourced from HLSyn and ML4Accel were publicly available on GitHub before Code-Llama's pre-training. No discussion of whether the model could have memorized these designs during pre-training."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The paper does not discuss whether the evaluation setup introduces information leakage. The MachineGen prompts are generated by ChatGPT from the reference designs, but the degree of information transferred from reference to prompt is not analyzed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Training and test sets are derived from the same 52 base designs with different pragma combinations (PIPELINE, PARALLEL, TILE). This creates high structural similarity between train and test examples. This non-independence is not discussed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram analysis, or decontamination pipeline is mentioned."
    363       }
    364     }
    365   },
    366   "scan_version": 3,
    367   "active_modules": ["experimental_rigor", "data_leakage"],
    368   "claims": [
    369     {
    370       "claim": "Fine-tuning Code-Llama-7B dramatically increases syntax correctness from 54.85% to 88.44% and functionality from 0% to 53.20%",
    371       "evidence": "Figure 6(a) and Section 5.2 show pass@3 results comparing non-finetuned vs finetuned models on the MachineGen test set.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Chain-of-thought prompting improves syntax correctness from 88.44% to 94.33% and functionality from 53.20% to 61.45%",
    376       "evidence": "Figure 6(b) and Section 5.3 compare results with and without CoT on the finetuned model.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Two-step feedback loops significantly improve syntax and functionality, with diminishing returns after the first iteration",
    381       "evidence": "Figures 7 and 8 in Section 5.4 show pass@3 results across 0, 1, and 2 feedback loop iterations for both syntax and functionality feedback.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "HLS-based designs require approximately 3–4x fewer tokens than equivalent HDL implementations",
    386       "evidence": "Figure 2 shows a token comparison between HLS and Verilog for an identical 16-bit multiplier design, with a normalized bar chart.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "Performance degrades with increasing code complexity (easy: 63.33% → difficult: 53.33% functionality)",
    391       "evidence": "Table 2 in Section 5.6 shows functionality declining from 63.33% (easy) to 53.33% (difficult), while syntax drops from 96.67% to 90%.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "The model performs significantly worse on human-refined prompts (47.29% syntax, 21.36% functionality) compared to machine-generated prompts (93.83%, 62.24%)",
    396       "evidence": "Table 3 in Section 5.7 compares MachineGen and HumanRefine test sets.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "methodology_tags": ["benchmark-eval"],
    401   "key_findings": "Fine-tuning Code-Llama-7B on a collected HLS dataset of 42,000+ programs dramatically improves syntax correctness (54.85%→88.44%) and functionality (0%→53.20%) compared to the base model. Chain-of-thought prompting and iterative feedback loops provide further incremental gains. However, performance drops sharply on human-refined prompts (syntax: 93.83%→47.29%), suggesting the model overfits to machine-generated prompt patterns and may not generalize to real user instructions.",
    402   "red_flags": [
    403     {
    404       "flag": "Very small and non-diverse test set",
    405       "detail": "Only 52 base designs from two repositories, split 4:1, yielding ~10 test designs. The 42,000+ figure comes from pragma combinations of these same 52 designs, meaning extremely limited design diversity. Claims about 'automated hardware generation' rest on a tiny sample of hardware types."
    406     },
    407     {
    408       "flag": "No external baselines",
    409       "detail": "All comparisons are between ablations of the authors' own system. Despite discussing VeriGen, VerilogEval, RTLFixer, and LLM-VeriPPA in the related work, none are compared against. It is impossible to assess whether this approach is competitive with existing methods."
    410     },
    411     {
    412       "flag": "Unaddressed contamination risk",
    413       "detail": "Benchmark HLS designs are sourced from public GitHub repositories (HLSyn, ML4Accel). Code-Llama was pre-trained on GitHub data. The non-finetuned model's 54.85% syntax rate could partly reflect pre-training memorization. This is never discussed."
    414     },
    415     {
    416       "flag": "Train-test non-independence",
    417       "detail": "Training and test sets are derived from the same 52 base designs with different pragma configurations. A model could learn the base design patterns from training and generalize to test by recognizing the same structure with different pragmas, inflating apparent generalization."
    418     },
    419     {
    420       "flag": "No error bars or repeated trials",
    421       "detail": "All results are single-run point estimates without variance, confidence intervals, or repeated trials. Given the small test set size, random variation could substantially affect reported percentages."
    422     },
    423     {
    424       "flag": "MachineGen/HumanRefine gap undermines practical claims",
    425       "detail": "The dramatic performance collapse on human-refined prompts (93.83%→47.29% syntax) suggests the model is pattern-matching machine-generated prompt structure rather than understanding hardware design. This severely limits practical applicability but is not framed as a central finding."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Evaluating large language models trained on code",
    431       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    432       "year": 2021,
    433       "arxiv_id": "2107.03374",
    434       "relevance": "Introduced HumanEval benchmark and Codex model — foundational work for LLM code generation evaluation that this paper's methodology builds upon."
    435     },
    436     {
    437       "title": "VerilogEval: Evaluating large language models for Verilog code generation",
    438       "authors": ["Mingjie Liu", "Nathaniel Pinckney", "Brucek Khailany", "Haoxing Ren"],
    439       "year": 2023,
    440       "relevance": "LLM benchmark for hardware description language generation that directly motivates this paper's HLS alternative approach."
    441     },
    442     {
    443       "title": "RTLLM: An open-source benchmark for design RTL generation with large language model",
    444       "authors": ["Yao Lu", "Shang Liu", "Qijun Zhang", "Zhiyao Xie"],
    445       "year": 2024,
    446       "relevance": "Open-source RTL generation benchmark used for comparison of data availability between HDL and software language datasets."
    447     },
    448     {
    449       "title": "Verigen: A large language model for Verilog code generation",
    450       "authors": ["Shailja Thakur", "Baleegh Ahmad", "Hammond Pearce"],
    451       "year": 2023,
    452       "relevance": "LLM fine-tuned for Verilog generation — a direct comparison point for LLM-assisted hardware design approaches."
    453     },
    454     {
    455       "title": "RTLFixer: Automatically fixing RTL syntax errors with large language models",
    456       "authors": ["YunDa Tsai", "Mingjie Liu", "Haoxing Ren"],
    457       "year": 2023,
    458       "arxiv_id": "2311.16543",
    459       "relevance": "Automated LLM-assisted debugging framework for RTL designs using RAG and ReAct prompting — related feedback-loop approach for hardware code."
    460     },
    461     {
    462       "title": "Starcoder: may the source be with you!",
    463       "authors": ["Raymond Li", "Loubna Ben Allal"],
    464       "year": 2023,
    465       "arxiv_id": "2305.06161",
    466       "relevance": "Large-scale code dataset and model showing HDL data scarcity relative to software languages — motivates the HLS approach."
    467     },
    468     {
    469       "title": "Code llama: Open foundation models for code",
    470       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    471       "year": 2023,
    472       "arxiv_id": "2308.12950",
    473       "relevance": "The base model (Code-Llama-7B) fine-tuned in this paper's experiments."
    474     },
    475     {
    476       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    477       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    478       "year": 2022,
    479       "relevance": "Foundational CoT prompting technique applied and evaluated in this paper for HLS code generation."
    480     },
    481     {
    482       "title": "RLTF: Reinforcement learning from unit test feedback",
    483       "authors": ["Jiate Liu", "Yiqin Zhu", "Kaiwen Xiao"],
    484       "year": 2023,
    485       "arxiv_id": "2307.04349",
    486       "relevance": "Uses unit test feedback for code generation improvement — related feedback loop methodology applied to software code."
    487     },
    488     {
    489       "title": "LLM-VeriPPA: Power, Performance, and Area-aware Verilog Code Generation and Refinement with Large Language Models",
    490       "authors": ["Anonymous Authors"],
    491       "year": 2024,
    492       "relevance": "Two-stage RTL refinement process for syntax, functionality, and hardware performance — closest related work for iterative LLM hardware generation."
    493     },
    494     {
    495       "title": "Structured chain-of-thought prompting for code generation",
    496       "authors": ["Jia Li", "Ge Li", "Yongmin Li", "Zhi Jin"],
    497       "year": 2023,
    498       "arxiv_id": "2305.06599",
    499       "relevance": "Structured CoT for code generation, showing that specialized prompting is needed for software code — contrasted with this paper's finding that simple CoT works for HLS."
    500     },
    501     {
    502       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    503       "authors": ["Daya Guo"],
    504       "year": 2025,
    505       "arxiv_id": "2501.12948",
    506       "relevance": "Advanced reasoning model mentioned as unavailable during this research — acknowledged as a limitation and left for future evaluation."
    507     }
    508   ],
    509   "engagement_factors": {
    510     "practical_relevance": {
    511       "score": 1,
    512       "justification": "HLS code generation is a niche hardware engineering application; the framework is not released, limiting immediate practitioner use."
    513     },
    514     "surprise_contrarian": {
    515       "score": 1,
    516       "justification": "The HLS-over-HDL argument for LLM generation is reasonable but not surprising; the MachineGen/HumanRefine gap is the most unexpected finding."
    517     },
    518     "fear_safety": {
    519       "score": 0,
    520       "justification": "No safety, security, or risk implications are raised by LLM-assisted HLS code generation."
    521     },
    522     "drama_conflict": {
    523       "score": 0,
    524       "justification": "No controversial claims or challenges to existing work; a straightforward empirical study."
    525     },
    526     "demo_ability": {
    527       "score": 0,
    528       "justification": "No code, data, or demo released; the framework cannot be tried."
    529     },
    530     "brand_recognition": {
    531       "score": 1,
    532       "justification": "Cambridge and Imperial College London are well-known universities but not prominent AI labs in the LLM space."
    533     }
    534   }
    535 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs