scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24844B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Code Language Models for Automated HLS-based Hardware Generation: Benchmark, Infrastructure and Analysis",
      6     "authors": [
      7       "Jiahao Gai",
      8       "Hao (Mark) Chen",
      9       "Zhican Wang",
     10       "Hongyu Zhou",
     11       "Wanru Zhao",
     12       "Nicholas Lane",
     13       "Hongxiang Fan"
     14     ],
     15     "year": 2025,
     16     "venue": "Asia and South Pacific Design Automation Conference (ASP-DAC'25)",
     17     "arxiv_id": "2502.13921",
     18     "doi": "10.1145/3658617.3697616"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims (LLMs for HLS, superiority over Verilog, effectiveness of CoT+feedback) are supported by ablation studies in Section 5.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Ablation studies (Sections 5.2-5.4) isolate effects of fine-tuning, CoT, and feedback loops with appropriate baselines.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Claims appropriately bounded to HLS on the collected benchmark. Authors acknowledge 'limited diversity of hardware designs' as a limitation (Section 5.8).",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Section 5.7 explains the MachineGen vs HumanRefine gap by model training bias, prompt complexity, and information density; Section 5.8 discusses multi-factor hypotheses.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Paper clearly distinguishes syntax correctness (GCC -fsyntax-only) from functional correctness (unit test output matching), reporting both separately throughout.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Dedicated 'Limitations' subsection in Section 5.8 lists: unavailable advanced models (DeepSeek-R1), unexplored test-time scaling, limited benchmark diversity.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Specific threats stated: limited HLS design diversity, model overfitting to machine-generated prompts (47% vs 94% performance gap), limited generalization without feedback loops in complex tasks.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Explicitly scoped: C-based HLS only (footnote), no hardware performance optimization in feedback, evaluation on Vivado-HLS only. Does not claim broader applicability.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding acknowledgment or statement appears in the provided paper text.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All seven authors' affiliations clearly listed: Imperial College London, University of Cambridge, Shanghai Jiao Tong University, University of Sydney.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Funding not disclosed, so independence cannot be evaluated.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or financial declarations visible in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms defined: HLS explained as C-based alternative requiring fewer tokens (Figure 2), pass@k metric defined, hardware performance defined as latency/power/area.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three explicit contributions listed: (1) fine-tuned models on 40K HLS dataset, (2) end-to-end generation framework with evaluation infrastructure, (3) CoT and feedback loop optimization techniques.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 comprehensively reviews LLM-assisted code generation and hardware generation literature; positions work as 'first step to investigate HLS code generation with LLM' with unique benchmark and infrastructure contributions.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Paper describes framework and evaluation infrastructure but does not explicitly state that code, fine-tuned models, or benchmark are released. No repository or data availability statement provided.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "The 42,000-program HLS dataset collected from open-source repositories is not stated to be released. Sources (HLSyn, ML4Accel) are open, but availability of the derived dataset is not mentioned.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "Detailed specs provided: Code-Llama-7B, QLoRA, 8-bit loading, sequence length 4096, warmup 100 steps, gradient accumulation 4, batch sizes specified, hardware (4x L20 GPUs, 80 vCPU Xeon), Vivado 2020.1.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "While pipeline stages are described, no step-by-step reproduction instructions are provided. No code repository, data download links, or exact command sequences for replication.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No confidence intervals, error bars, or variance estimates reported for any primary results. Pass@3 percentages shown as point estimates only.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests (t-tests, chi-square, etc.) reported. Only raw percentage comparisons provided.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Percentage improvements reported (e.g., 54.85%→88.44% for syntax, 0%→53.20% for functionality), providing absolute effect magnitudes.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No justification for test set size (52 base designs, ~10 variants per category in test split). No power analysis or sample size calculation provided.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Pass@3 metric implies 3 samples but aggregate results show no variance/std dev. Single values reported for latency and resource usage (Table 1).",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Ablations compare finetuned vs non-finetuned, with/without CoT, with/without feedback loops. Non-finetuned baseline provides key comparison point.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Uses Code-Llama-7B (2023) and StarCoder. Contemporary with 2025 publication. However, acknowledges missing DeepSeek-R1 and test-time scaling.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Comprehensive ablations: fine-tuning (5.2), CoT (5.3), syntax feedback (5.4), functionality feedback (5.4), task complexity (5.6), prompt type (5.7).",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Multiple metrics reported: syntax correctness, functional correctness (both pass@3), latency (ms), resource usage (LUTs, registers, DSPs, BRAMs).",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "No human evaluation of generated code. Unit tests are automated. Not relevant given task is code generation with objective correctness criteria.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Dataset split 4:1 training:test. Held-out test set used for all evaluations in Sections 5.2-5.7.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Table 2 breaks results by complexity (Easy/Medium/Difficult); Table 3 shows MachineGen vs HumanRefine; Table 1 shows per-design latency/resource breakdown.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 5.6 analyzes failure pattern: performance degrades with code complexity (96.67%→90% syntax, 63.33%→53.33% function). Hypothesizes absence of feedback loops limits self-correction on complex tasks.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "HumanRefine prompts show dramatic failure: 47.29% syntax vs 93.83% MachineGen, 21.36% vs 62.24% functionality. Honestly reported as evidence of model limitations.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Code-Llama-7B explicitly specified. ChatGPT 3.5 and 4 for description generation. Snapshot dates/exact commit hashes not provided.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Base instruction prompt shown ('Generate HLS code with...'). CoT prompt explicitly provided in Figure 5 with all four reasoning steps.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Warmup 100, gradient accumulation 4, micro-batch 4, inference batch 2, sequence length 4096 reported. Sampling parameters (temperature, top-p) not specified.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Two-stage framework clearly described: (1) fine-tuning with QLoRA, (2) iterative generation with CoT and two-step feedback loop (syntax then function). Figure 4 provides flowchart.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Collection from HLSyn/ML4Accel repos, 52 base designs × pragma combinations → 42K variants, invalid programs filtered. Test split provided in two versions (MachineGen, HumanRefine). Process reasonably documented.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Base designs sourced from open repositories (HLSyn, ML4Accel) but derived 42K-program dataset not stated to be publicly available.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Collection method clear: 52 designs from open-source, combined with HLS pragmas (PIPELINE, PARALLEL, TILE), invalid programs filtered, 4:1 train/test split described.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human subjects involved. N/A.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "Full pipeline documented: open-source collection → pragma combinations → filtering → ChatGPT description generation → 4:1 split → evaluation with syntax/functional checks.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "Code-Llama training cutoff date not explicitly stated. HLS designs sourced from GitHub but collection date not specified, raising risk of contamination with pre-training data.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of potential overlap between pre-training data and HLS designs collected from GitHub. Given use of open-source code, some designs may have appeared in training.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "No analysis of whether benchmark examples were available before Code-Llama training cutoff. HLS designs from GitHub repos of unknown vintage create unquantified contamination risk.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human subjects. N/A.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human subjects. N/A.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human subjects. N/A.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human subjects. N/A.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human subjects. N/A.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human subjects. N/A.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human subjects. N/A.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Inference latency reported: 7s (w/o feedback), 9s (syntax), 11s (function) for 120 data points. Does not report token count, energy consumption, or monetary cost despite claiming energy-efficiency.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Hardware used specified (4x L20 GPU, 80 vCPU, 100GB RAM) but total computational budget, training time, or cost not quantified.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "HLS-based designs require 3-4x fewer tokens than Verilog-based designs",
    377       "evidence": "Figure 2 shows token comparison: HLS normalized to ~25%, Verilog to ~100%",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Fine-tuning dramatically improves hardware code generation capability",
    382       "evidence": "Section 5.2: syntax 54.85%→88.44%, functionality 0%→53.20% with fine-tuning",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Chain-of-thought prompting enhances HLS generation quality",
    387       "evidence": "Section 5.3: syntax 88.44%→94.33%, functionality 53.20%→61.45% with CoT",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Iterative feedback loops improve code generation with diminishing returns",
    392       "evidence": "Section 5.4: first feedback loop provides substantial improvement; second iteration shows diminishing returns in Figures 7-8",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Code complexity inversely correlates with generation success",
    397       "evidence": "Table 2: Easy 96.67% syntax vs Difficult 90%, Easy 63.33% vs Difficult 53.33% functionality",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Models are strongly biased toward machine-generated prompts",
    402       "evidence": "Table 3: MachineGen 93.83% syntax vs HumanRefine 47.29%, an ~46pp gap suggesting overfitting to synthetic format",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Generated HLS designs synthesize efficiently on real FPGAs",
    407       "evidence": "Table 1: 9 designs synthesize to reasonable latencies (0.3-579ms) and resource usage on Xilinx VCU118",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "empirical"
    414   ],
    415   "key_findings": "Fine-tuning pre-trained language models on a collected HLS dataset dramatically improves code generation from 0% to 53% functional correctness. Chain-of-thought prompting and iterative feedback loops provide additional improvements (final 62% functional). However, the model exhibits severe overfitting to machine-generated prompts (94% syntax) compared to human-refined prompts (47% syntax), suggesting limited real-world applicability. Performance also degrades as code complexity increases (Table 2: 96.67%→90% syntax, 63.33%→53.33% functionality from Easy to Difficult).",
    416   "red_flags": [
    417     {
    418       "flag": "Tiny benchmark with synthetic diversity",
    419       "detail": "Only 52 base designs expanded to 42K via pragma combinations. Authors acknowledge 'limited diversity of hardware designs' (Section 5.8). Generalization to unseen design patterns unvalidated."
    420     },
    421     {
    422       "flag": "Dramatic prompt-type distribution shift",
    423       "detail": "Model scores 93.83% on machine-generated vs 47.29% on human-refined prompts (Table 3). Indicates overfitting to synthetic training prompt format, severely limiting practical deployment."
    424     },
    425     {
    426       "flag": "No comparison with prior hardware generation methods",
    427       "detail": "No comparative evaluation against VerilogEval, RTLFixer, LLM-VeriPPA, or other Verilog/RTL generation approaches. Cannot assess whether HLS actually improves over the claimed alternatives."
    428     },
    429     {
    430       "flag": "Synthetic training descriptions from ChatGPT",
    431       "detail": "All 42K descriptions generated by ChatGPT 3.5/4 rather than human-written. Introduces potential quality inconsistency, data contamination risk if ChatGPT saw HLS repositories, and learning from AI-generated text."
    432     },
    433     {
    434       "flag": "No statistical variance or significance testing",
    435       "detail": "Zero confidence intervals, error bars, or hypothesis tests. Pass@3 percentages reported as point estimates. Unclear if improvements are statistically significant or due to sampling noise."
    436     },
    437     {
    438       "flag": "Unvalidated pass@k metric",
    439       "detail": "Pass@3 chosen without justification. Why 3 samples? Is this standard for hardware generation? No ablation on k parameter."
    440     },
    441     {
    442       "flag": "Potential training-test contamination",
    443       "detail": "HLS designs collected from open GitHub repositories; Code-Llama training cutoff not specified. Designs may have appeared in pre-training, inflating apparent performance."
    444     },
    445     {
    446       "flag": "Missing comparison with concurrent LLM approaches",
    447       "detail": "No comparison with GPT-4, Sonnet, or other state-of-the-art models available at submission. Only fine-tuned 7B models evaluated."
    448     },
    449     {
    450       "flag": "Hardware performance claims unsupported",
    451       "detail": "Table 1 shows designs fit on FPGA but includes no optimization step and no comparison of area/power efficiency. Claims about HLS efficiency vs Verilog are inferred, not measured."
    452     },
    453     {
    454       "flag": "Code and data not released",
    455       "detail": "No statement that fine-tuned models, 42K dataset, or framework code are publicly available. Reproducibility impossible without these artifacts."
    456     }
    457   ],
    458   "cited_papers": [
    459     {
    460       "title": "CodeX: Evaluating large language models trained on code",
    461       "relevance": "Foundational work on LLM code generation; establishes HumanEval benchmark referenced in this work"
    462     },
    463     {
    464       "title": "StarCoder: may the source be with you",
    465       "relevance": "Major code LLM baseline model; used as base for HLS fine-tuning in this work"
    466     },
    467     {
    468       "title": "VerilogEval: Evaluating large language models for Verilog code generation",
    469       "relevance": "Prior work on LLM hardware generation (Verilog); direct precedent for HLS-based approach"
    470     },
    471     {
    472       "title": "Verigen: A large language model for Verilog code generation",
    473       "relevance": "HDL-focused prior work; establishes baseline for comparison of HLS vs low-level hardware languages"
    474     },
    475     {
    476       "title": "LLM-VeriPPA: Power, Performance, and Area-aware Verilog Code Generation",
    477       "relevance": "Recent Verilog generation with performance optimization; closest related work to this HLS approach"
    478     },
    479     {
    480       "title": "RTLFixer: Automatically fixing RTL syntax errors with large language models",
    481       "relevance": "Prior feedback loop approach for hardware debugging; informs two-step feedback design in this work"
    482     },
    483     {
    484       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    485       "relevance": "Foundational CoT technique; applied here to HLS code generation with hardware-specific reasoning steps"
    486     },
    487     {
    488       "title": "QLoRA: Efficient finetuning of quantized LLMs",
    489       "relevance": "Fine-tuning technique used in this work for efficient 7B model training on limited hardware"
    490     }
    491   ],
    492   "engagement_factors": {
    493     "practical_relevance": {
    494       "score": 2,
    495       "justification": "HLS generation could aid hardware design, but severe overfitting to synthetic prompts (47% on human prompts) and small benchmark limit immediate practical utility."
    496     },
    497     "surprise_contrarian": {
    498       "score": 1,
    499       "justification": "Finding that HLS outperforms Verilog is unsurprising given HLS similarity to software languages. The human-prompt failure is notable but framed as limitation, not insight."
    500     },
    501     "fear_safety": {
    502       "score": 0,
    503       "justification": "No AI safety or security concerns raised. Paper focuses on code generation capability, not misuse risks."
    504     },
    505     "drama_conflict": {
    506       "score": 0,
    507       "justification": "Incremental technical contribution. No contested claims, methodology debates, or controversy."
    508     },
    509     "demo_ability": {
    510       "score": 2,
    511       "justification": "Fine-tuned HLS models can generate working hardware, but code/models not released. Readers cannot run or test the approach."
    512     },
    513     "brand_recognition": {
    514       "score": 2,
    515       "justification": "Imperial College and Cambridge are prestigious, but paper uses standard base models (Code-Llama, StarCoder) with no novel architectural contributions."
    516     }
    517   },
    518   "hn_data": {
    519     "threads": [],
    520     "top_points": 0,
    521     "total_points": 0,
    522     "total_comments": 0
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs