scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32294B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Code Language Models for Automated HLS-based Hardware Generation: Benchmark, Infrastructure and Analysis",
      6     "authors": [
      7       "Jiahao Gai",
      8       "Hao (Mark) Chen",
      9       "Zhican Wang",
     10       "Hongyu Zhou",
     11       "Wanru Zhao",
     12       "Nicholas Lane",
     13       "Hongxiang Fan"
     14     ],
     15     "year": 2025,
     16     "venue": "Asia and South Pacific Design Automation Conference (ASP-DAC)",
     17     "arxiv_id": "2502.13921",
     18     "doi": "10.1145/3658617.3697616"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract claims 'Comprehensive experiments demonstrate the effectiveness of our methods' and the results in Sections 5.2–5.4 do show improvements from finetuning, CoT, and feedback loops. The abstract is relatively hedged, noting limitations and future work.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims (e.g., 'finetuning dramatically increases syntax correctness,' 'incorporating CoT leads to a noticeable improvement') are backed by controlled ablation studies that vary one component at a time (Sections 5.2–5.4). The ablation design is adequate for these claims.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The title claims 'Automated HLS-based Hardware Generation' broadly, but experiments use only one model (Code-Llama-7B), one fine-tuning method (QLoRA), and 52 base designs from two repositories. Section 5.8 acknowledges 'the diversity of hardware designs in the benchmark is limited' but the title and abstract do not bound claims to the tested setting.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Section 5.7 offers one interpretation for the MachineGen/HumanRefine gap (training data bias, complexity of human prompts) but does not consider alternatives. No discussion of confounds such as whether improvements from CoT are due to longer input sequences, or whether the non-finetuned model's 0% functionality could reflect prompt format mismatch rather than lack of HLS knowledge.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper measures pass@3 for syntax correctness and functionality correctness, and claims are framed in terms of these specific metrics. No broader claims about 'hardware design quality' beyond what is measured, though Section 5.8 notes hardware performance feedback is left for future work.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 5.8 'Thoughts, Insights, and Limitations' contains a dedicated subsection (item 3) discussing limitations including unavailability of advanced reasoning models, limited benchmark diversity, and missing test-time scaling approaches.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 5.8 item 3 discusses specific threats: DeepSeek-R1 was not available for evaluation, test-time scaling could improve results, and 'the diversity of hardware designs in the benchmark is limited, which may impact the generalizability of our findings.' These are specific to this study.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper explicitly states it focuses on C-based HLS (footnote Section 2), acknowledges the benchmark's limited diversity may impact generalizability (Section 5.8), and identifies hardware performance feedback as outside current scope. The abstract notes limitations due to 'the timeframe of this research.'",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source, acknowledgments section, or grant information appears anywhere in the paper.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed: Imperial College London, University of Cambridge, Shanghai Jiao Tong University, University of Sydney. All are academic institutions with no apparent product conflict.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funding is disclosed, making it impossible to assess funder independence.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is included in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Core terms defined: HLS/HDL in intro, pass@k metric in Section 2.2, syntax/functionality operationalized in Section 3.4; some terms (e.g., 'quality') used somewhat loosely.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three contributions explicitly stated in introduction: dataset (40K+ programs), framework for HLS generation, and optimization techniques (CoT/feedback).",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Sections 2.1 and 2.2 review LLM code generation and hardware generation literature; paper positions itself as first to systematically explore HLS as alternative to low-level HDLs.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No repository URL is provided for the framework, fine-tuned models, or evaluation infrastructure. The paper references open-source tools used (axolotl, Code-Llama) but does not release its own code.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "The collected dataset of 42,000+ HLS programs is not released. The paper describes collecting from HLSyn and ML4Accel repositories but does not provide a download link for the curated and filtered dataset or the test set with unit tests.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Section 5.1 lists hardware (4x NVIDIA L20 GPUs, 80 vCPU Intel Xeon Platinum 8457C, 100GB RAM) and some training settings (8-bit loading, sequence length 4096), but no requirements.txt, Dockerfile, or complete library version list is provided. Software versions are incomplete (Vivado 2020.1 is mentioned but Python version, PyTorch version, axolotl version, etc. are not).",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are provided. The paper describes the framework at a high level but does not include commands, scripts, or a README for replication.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All results are reported as single point estimates (e.g., 88.44% syntax, 53.20% functionality in Figure 6). No confidence intervals or error bars appear in any figure or table.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "The paper claims improvements from finetuning, CoT, and feedback loops based solely on comparing raw percentages. No statistical significance tests (t-tests, bootstrap, etc.) are applied to any comparison.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Results are reported with both baseline and improved values, providing context for the magnitude of effects. For example, Section 5.2: syntax from 54.85% to 88.44%, functionality from 0% to 53.20%. Section 5.3: syntax from 88.44% to 94.33%, functionality from 53.20% to 61.45%.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The test set derives from a 4:1 split of only 52 base designs (Section 3.2), yielding approximately 10 test designs. No justification is given for why this sample size is sufficient. No power analysis is discussed.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "No standard deviations, variance, or spread measures are reported for any experiment. All results appear to be from single runs without repeated trials.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "The non-finetuned Code-Llama-7B serves as a baseline (Section 5.2, Figure 6a). Ablations of CoT and feedback loops provide additional internal baselines.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "All comparisons are against ablations of the authors' own system. No external baselines are included — no comparison with VeriGen, VerilogEval methods, RTLFixer, LLM-VeriPPA, or other contemporary LLM-for-hardware approaches discussed in Section 2.2.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Sections 5.2–5.4 systematically ablate three components: supervised fine-tuning (Section 5.2), chain-of-thought prompting (Section 5.3), and feedback loops with varying iterations (Section 5.4).",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Two metrics are used throughout: pass@3 syntax correctness and pass@3 functionality correctness. Table 1 also reports latency and resource usage (LUTs, registers, DSP48s, BRAMs) for synthesized designs.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "No human evaluation of generated code quality. All evaluation is automated through GCC syntax checking and unit test comparison.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Section 3.2 states the 52 designs are 'split into training and testing sets at a 4:1 ratio.' Results are reported on this test split.",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Table 2 (Section 5.6) breaks down performance by complexity level (easy, medium, difficult). Table 3 (Section 5.7) compares MachineGen vs HumanRefine. Section 3.2 categorizes designs into five types.",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "The paper discusses performance trends (e.g., decline with complexity, HumanRefine gap) but does not show specific examples of failed generations, common error types, or qualitative analysis of what goes wrong.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Table 3 reports the dramatic HumanRefine performance drop (syntax 47.29%, functionality 21.36% vs MachineGen's 93.83%, 62.24%). Section 5.4 reports diminishing returns for the second feedback loop iteration. Section 5.8 acknowledges benchmark diversity limitations.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "The paper specifies 'Code-Llama-7B' (Section 5.1) and 'ChatGPT (version 3.5 and 4)' (Section 3.3) but provides no specific checkpoint, snapshot date, or API version for any model. Code-Llama-7B has multiple releases, and 'ChatGPT 3.5 and 4' are marketing names without version granularity.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "The default instruction prompt is stated as 'Generate HLS code with the following instructions:' (Section 3.1). Figure 5 provides the full CoT prompt text ('Let's think step by step. First, Consider the characteristics of FPGA...'). These are the actual prompts used, not just descriptions.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Section 5.1 reports some settings: 8-bit loading, sequence length 4096, warmup steps 100, gradient accumulation 4, micro-batch size 4, inference batch size 2. However, critical hyperparameters are missing: learning rate, number of epochs, temperature and sampling settings for generation (top-p, top-k).",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "The iterative framework is described in detail: Figure 4 provides a full overview, Section 4.1 describes the two-stage pipeline, Section 4.2 details the CoT mechanism, and Section 4.3 describes the two-step feedback loop (syntax check → feedback → functionality check → feedback) with iteration control.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "Section 3.2 states designs were collected from HLSyn and ML4Accel, then 'We filter out the HLS programs that are invalid, resulting in a collection of over 42,000 HLS programs.' The filtering criteria for 'invalid' programs are not specified, and the count before filtering is not stated.",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Neither the collected HLS dataset, the generated design descriptions, the fine-tuned model weights, nor the test outputs are made available for independent verification.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 3.2 describes the data sources (HLSyn, ML4Accel open-source repositories), the five design categories (matrix operations, scientific simulations, statistical computations, iterative methods, other kernels), the filtering of invalid programs, and the 4:1 train/test split.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. Data comes from identified open-source repositories (HLSyn, ML4Accel).",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "The pipeline goes from 52 base designs → different pragma combinations → filtering invalid programs → 42,000+ programs → 4:1 split. However, the number of programs before filtering is not stated, the criteria for 'invalid' are unspecified, and the exact test set size is not given.",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The paper fine-tunes and evaluates Code-Llama-7B on an HLS benchmark but never states the model's pre-training data cutoff date. This is relevant because the HLS designs come from public GitHub repositories that may be in the pre-training corpus.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether the HLS designs from HLSyn and ML4Accel (both on GitHub) could have appeared in Code-Llama's pre-training data. The train/test split is described but potential overlap with pre-training data is not addressed.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "The benchmark designs are sourced from public GitHub repositories (HLSyn, ML4Accel) that existed before Code-Llama's training. Code-Llama was trained on GitHub data, creating a clear contamination risk that is never discussed.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants. This is a benchmark evaluation of LLM-generated HLS code.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Figure 9 reports inference time averaged over 120 data points under different conditions (w/o feedback, syntax loop, functionality loop, each with and without CoT). Time cost ranges from 5–11 seconds per data point.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Section 5.1 describes the hardware setup (4x NVIDIA L20 GPUs, 80 vCPU, 100GB RAM) but does not state total GPU hours for fine-tuning, total training time, or overall computational budget.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single training run and single evaluation.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "The paper uses pass@3 (3 generated samples per problem) but does not state how many independent runs of the entire experiment were conducted. It is unclear whether results are from a single run or averaged across multiple runs.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "Section 5.1 lists fixed hyperparameter values but does not describe any hyperparameter search process, number of configurations tried, or how the reported values were selected.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "The paper reports results for specific configurations without explaining how they were selected. No validation-based selection process is described.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "No statistical tests are performed at all, so the question of correcting for multiple comparisons is structurally inapplicable.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "All comparisons are between the authors' own system variants. No external baselines are included and no discussion of author-evaluation bias is present.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "Figure 9 shows time cost for different configurations, but performance is not plotted as a function of compute budget. The time and accuracy results are presented separately, without matched-compute analysis.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "The paper does not discuss whether pass@3 syntax correctness and unit-test-based functionality adequately measure the ability to generate useful hardware designs. No discussion of construct validity or comparison with alternative evaluation approaches.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": false,
    423           "answer": false,
    424           "justification": "The framework (scaffold) IS the thing being evaluated and ablated. The paper tests one model (Code-Llama-7B) within its own scaffold, varying scaffold components as independent variables. No cross-model-cross-scaffold comparison is made.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "The HLS designs sourced from HLSyn and ML4Accel were publicly available on GitHub before Code-Llama's pre-training. No discussion of whether the model could have memorized these designs during pre-training.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether the evaluation setup provides any information leakage. The MachineGen prompts are generated by ChatGPT from the reference designs — the degree of information transferred from reference to prompt is not analyzed.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "Training and test sets are derived from the same 52 base designs with different pragma combinations (PIPELINE, PARALLEL, TILE). This creates high structural similarity between train and test examples. This non-independence is not discussed.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram analysis, or decontamination pipeline is mentioned.",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "Fine-tuning pre-trained code LLMs on HLS datasets dramatically improves syntax and functionality correctness.",
    459       "evidence": "Section 5.2, Figure 6a: syntax 54.85% → 88.44%, functionality 0% → 53.20% with finetuning.",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "Chain-of-thought prompting improves HLS generation quality.",
    464       "evidence": "Section 5.3, Figure 6b: syntax 88.44% → 94.33%, functionality 53.20% → 61.45% with CoT.",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Two-step feedback loops (syntax + functionality) provide further improvements with diminishing returns.",
    469       "evidence": "Section 5.4, Figures 7–8: both metrics improve after 1st loop, negligible gains after 2nd loop.",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "HLS is more token-efficient than Verilog for LLM-based hardware generation.",
    474       "evidence": "Figure 2: HLS requires 3–4× fewer tokens than Verilog-based designs. No direct LLM comparison provided.",
    475       "supported": "moderate"
    476     },
    477     {
    478       "claim": "The model shows dramatic performance collapse on human-refined prompts vs machine-generated ones.",
    479       "evidence": "Table 3: MachineGen 93.83% syntax vs HumanRefine 47.29%, functionality 62.24% vs 21.36%.",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "Hardware design complexity negatively impacts generation quality.",
    484       "evidence": "Table 2: easy/medium 96.67% syntax vs difficult 90%; easy 63.33% functionality, medium/difficult 53.33%.",
    485       "supported": "strong"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "ablation-study"
    491   ],
    492   "key_findings": "Fine-tuned language models can generate syntactically and functionally correct HLS code, achieving 94.33% syntax and 61.45% functionality with chain-of-thought prompting on machine-generated prompts. However, the approach shows severe degradation on human-refined prompts (47.29% syntax, 21.36% functionality), indicating overfitting to machine-generated prompt patterns rather than genuine understanding of hardware design. Iterative feedback provides substantial improvements in the first loop but negligible gains in the second. Hardware design complexity correlates negatively with generation success.",
    493   "red_flags": [
    494     {
    495       "flag": "Train/test contamination",
    496       "detail": "Same 52 base hardware designs appear in both training and test sets (as different pragma variants). This risks overfitting to specific design patterns and inflates performance metrics."
    497     },
    498     {
    499       "flag": "No statistical significance testing",
    500       "detail": "All results reported as point estimates without confidence intervals, error bars, or p-values. Unclear whether observed improvements are statistically significant."
    501     },
    502     {
    503       "flag": "Small, non-diverse test set",
    504       "detail": "Only 52 base designs across 5 categories. Authors acknowledge in Section 5.8.3 that limited diversity 'may impact the generalizability of our findings.'"
    505     },
    506     {
    507       "flag": "Severe model sensitivity to prompt format",
    508       "detail": "93.83% syntax on machine-generated prompts vs 47.29% on human-refined—a 46.5pp collapse—suggests the model exploits statistical regularities in synthetic prompts rather than learning transferable design patterns."
    509     },
    510     {
    511       "flag": "No experimental comparison to related work",
    512       "detail": "RTLLM, VerilogEval, RTLFixer, and LLM-VeriPPA mentioned in related work but no side-by-side benchmarking. Unclear how HLS generation compares empirically."
    513     },
    514     {
    515       "flag": "Incomplete cost and practicality analysis",
    516       "detail": "Inference time reported (Figure 9) but total training time, GPU-hours, token count per example, and monetary cost not quantified. Practicality for real-world use unclear."
    517     },
    518     {
    519       "flag": "Unclear test set size",
    520       "detail": "Total test set size not explicitly stated in methodology. Different experiments reference different numbers (120 for time analysis). Reproducibility hindered."
    521     }
    522   ],
    523   "cited_papers": [
    524     {
    525       "title": "Verigen: A large language model for verilog code generation",
    526       "relevance": "Prior work on low-level HDL generation; paper positions HLS as more efficient alternative."
    527     },
    528     {
    529       "title": "VerilogEval: Evaluating large language models for verilog code generation",
    530       "relevance": "Benchmark design for HDL evaluation; pass@k metric and evaluation methodology adapted by this work."
    531     },
    532     {
    533       "title": "RTLLM: An open-source benchmark for design rtl generation with large language model",
    534       "relevance": "RTL benchmark demonstrating data scarcity problem that HLS approach aims to mitigate."
    535     },
    536     {
    537       "title": "LLM-VeriPPA: Power, Performance, and Area-aware Verilog Code Generation and Refinement with LLMs",
    538       "relevance": "Feedback-based refinement approach parallel to this work's two-step feedback loop."
    539     },
    540     {
    541       "title": "Code Llama: Open foundation models for code",
    542       "relevance": "Base model (Code-Llama-7B) used for fine-tuning; pre-trained on software code."
    543     },
    544     {
    545       "title": "Evaluating large language models trained on code (Codex/HumanEval)",
    546       "relevance": "Foundational code generation benchmark; pass@k evaluation metric adopted here."
    547     },
    548     {
    549       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    550       "relevance": "CoT technique adapted for structured hardware design reasoning."
    551     },
    552     {
    553       "title": "RTLFixer: Automatically fixing rtl syntax errors with large language models",
    554       "relevance": "LLM-based feedback and error-driven refinement for RTL; parallel to syntax feedback loop."
    555     }
    556   ],
    557   "engagement_factors": {
    558     "practical_relevance": {
    559       "score": 1,
    560       "justification": "HLS code generation is a niche hardware engineering application; the framework is not released, limiting immediate practitioner use."
    561     },
    562     "surprise_contrarian": {
    563       "score": 1,
    564       "justification": "The HLS-over-HDL argument for LLM generation is reasonable but not surprising; the MachineGen/HumanRefine gap is the most unexpected finding."
    565     },
    566     "fear_safety": {
    567       "score": 0,
    568       "justification": "No safety, security, or risk implications are raised by LLM-assisted HLS code generation."
    569     },
    570     "drama_conflict": {
    571       "score": 0,
    572       "justification": "No controversial claims or challenges to existing work; a straightforward empirical study."
    573     },
    574     "demo_ability": {
    575       "score": 0,
    576       "justification": "No code, data, or demo released; the framework cannot be tried."
    577     },
    578     "brand_recognition": {
    579       "score": 1,
    580       "justification": "Cambridge and Imperial College London are well-known universities but not prominent AI labs in the LLM space."
    581     }
    582   },
    583   "hn_data": {
    584     "threads": [],
    585     "top_points": 0,
    586     "total_points": 0,
    587     "total_comments": 0
    588   }
    589 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs