scan-v5.json (26710B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LAAFD: LLM-based Agents for Accelerated FPGA Design", 6 "authors": [ 7 "Maxim Moraru", 8 "Kamalavasan Kamalakkannan", 9 "Jered Dominguez-Trujillo", 10 "Patrick Diehl", 11 "Atanu Barai", 12 "Julien Loiseau", 13 "Zachary Kent Baker", 14 "Howard Pritchard", 15 "Galen M. Shipman" 16 ], 17 "year": 2026, 18 "venue": "arXiv", 19 "arxiv_id": "2602.06085", 20 "doi": null 21 }, 22 "checklist": { 23 "claims_and_evidence": { 24 "abstract_claims_supported": { 25 "applies": true, 26 "answer": true, 27 "justification": "All abstract claims (99.9% performance, SODA parity, readable kernels) are supported by Section V results in Tables I and IV with corresponding cycle count and complexity data.", 28 "source": "haiku" 29 }, 30 "causal_claims_justified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Section V.E states 'only one or two runs produced optimal designs' out of 10 per kernel, yet abstract claims 99.9% performance. Reporting best-of-run results rather than typical performance undermines the causal claim that the workflow reliably produces this quality.", 34 "source": "haiku" 35 }, 36 "generalization_bounded": { 37 "applies": true, 38 "answer": false, 39 "justification": "Tested on 15 kernels on one FPGA (xcu250) with one HLS tool (Vitis 2022.2), yet abstract claims the approach 'substantially lowers the expertise barrier to FPGA acceleration' and conclusion states it 'has the potential to broaden adoption.' 
Scope far exceeds tested domain.", 40 "source": "haiku" 41 }, 42 "alternative_explanations_discussed": { 43 "applies": true, 44 "answer": false, 45 "justification": "Paper does not discuss whether models are memorizing from training data, whether simpler heuristics could achieve similar results, or whether optimization feedback is actually necessary versus random iteration.", 46 "source": "haiku" 47 }, 48 "proxy_outcome_distinction": { 49 "applies": true, 50 "answer": true, 51 "justification": "Measured outcomes (execution cycles, lines of code, cyclomatic complexity) directly match claimed outcomes (latency performance, code readability) with no conflation.", 52 "source": "haiku" 53 } 54 }, 55 "limitations_and_scope": { 56 "limitations_section_present": { 57 "applies": true, 58 "answer": false, 59 "justification": "No dedicated 'Limitations' or 'Threats to Validity' section. Issues are scattered in Section V.E discussion and conclusion rather than systematically presented.", 60 "source": "haiku" 61 }, 62 "threats_to_validity_specific": { 63 "applies": true, 64 "answer": false, 65 "justification": "Section V.E mentions stochasticity and context limits buried in discussion, but lacks systematic threat analysis. 
The critical fact that 'only one or two runs produced optimal designs' is treated as an evaluation note, not a validity threat.", 66 "source": "haiku" 67 }, 68 "scope_boundaries_stated": { 69 "applies": true, 70 "answer": false, 71 "justification": "Paper specifies 15 HPC kernels and one FPGA device tested, but does not explicitly state non-applicability to: other HLS tools, embedded FPGAs, kernels exceeding context limits, or production workloads.", 72 "source": "haiku" 73 } 74 }, 75 "conflicts_of_interest": { 76 "funding_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Acknowledgment discloses Los Alamos National Laboratory, Triad National Security, and Department of Energy funding under contract 89233218CNA000001.", 80 "source": "haiku" 81 }, 82 "affiliations_disclosed": { 83 "applies": true, 84 "answer": true, 85 "justification": "All nine authors listed with Los Alamos National Laboratory affiliation in header.", 86 "source": "haiku" 87 }, 88 "funder_independent_of_outcome": { 89 "applies": true, 90 "answer": true, 91 "justification": "Funder (US Department of Energy) is independent from the outcome (LAAFD tool). Not evaluating a vendor's product for the vendor.", 92 "source": "haiku" 93 }, 94 "financial_interests_declared": { 95 "applies": true, 96 "answer": false, 97 "justification": "No competing interests statement, no declaration of patents, equity interests, or consulting relationships related to FPGA tools or synthesis frameworks.", 98 "source": "haiku" 99 } 100 }, 101 "scope_and_framing": { 102 "key_terms_defined": { 103 "applies": true, 104 "answer": false, 105 "justification": "Key terms like 'agent', 'optimization', 'synthesis', 'latency' are used without upfront definitions. 
Section II provides HLS background but assumes FPGA and hardware design familiarity.", 106 "source": "haiku" 107 }, 108 "intended_contribution_clear": { 109 "applies": true, 110 "answer": true, 111 "justification": "Contributions explicitly stated in Section I: (1) LAAFD workflow, (2) comparison of LLM models, (3) performance evaluation. Intended contribution is unambiguous.", 112 "source": "haiku" 113 }, 114 "engagement_with_prior_work": { 115 "applies": true, 116 "answer": true, 117 "justification": "Section III engages with 20+ prior works (Verigen, RTLCoder, HLSPilot, C2HLSC), explains evolution from one-shot to iterative approaches, and claims 'to the best of our knowledge, this is the first tool capable of reaching minimum theoretical latency across a wide range of kernels.'", 118 "source": "haiku" 119 } 120 } 121 }, 122 "type_checklist": { 123 "empirical": { 124 "artifacts": { 125 "code_released": { 126 "applies": true, 127 "answer": false, 128 "justification": "Paper describes LAAFD system and methodology but does not mention releasing the tool, agents, or test code. No repository, GitHub link, or artifact descriptor provided.", 129 "source": "haiku" 130 }, 131 "data_released": { 132 "applies": true, 133 "answer": false, 134 "justification": "The 15 kernel descriptions are provided in Table I and Figures 3-8, but source code is not released. SODA kernels referenced but not provided. Generated HLS outputs not published.", 135 "source": "haiku" 136 }, 137 "environment_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Table III specifies Vitis 2022.2, FPGA device xcu250-figd2104-2L-e, 200 MHz frequency, 128-bit vector path, and model names (GPT-5, GPT-5-nano, o4-mini). 
Environment is reproducible.", 141 "source": "haiku" 142 }, 143 "reproduction_instructions": { 144 "applies": true, 145 "answer": false, 146 "justification": "Paper describes the workflow architecture and example (Figures 3-8) but provides no step-by-step instructions for setting up the system, running agents, or interpreting results.", 147 "source": "haiku" 148 } 149 }, 150 "statistical_methodology": { 151 "confidence_intervals_or_error_bars": { 152 "applies": true, 153 "answer": false, 154 "justification": "Tables I and IV report single point estimates (ideal, manual, LAAFD). Figure 9 shows model comparison but no error bars, confidence intervals, or variance estimates.", 155 "source": "haiku" 156 }, 157 "significance_tests": { 158 "applies": true, 159 "answer": false, 160 "justification": "Comparisons are descriptive (99.9% geomean, matches SODA) with no statistical significance tests, p-values, or hypothesis tests reported.", 161 "source": "haiku" 162 }, 163 "effect_sizes_reported": { 164 "applies": true, 165 "answer": true, 166 "justification": "Effect sizes reported as percentages of baseline (99.9%, 99.5%, 52.5%, 32.7%), code complexity ratios (0.89×, 2.27×), and success counts (6/15, 10/15, 15/15 kernels above 98%).", 167 "source": "haiku" 168 }, 169 "sample_size_justified": { 170 "applies": true, 171 "answer": false, 172 "justification": "15 kernels are described as 'representing common compute patterns' but no justification for why 15 specifically. 
No power analysis or sample size calculation provided.", 173 "source": "haiku" 174 }, 175 "variance_reported": { 176 "applies": true, 177 "answer": false, 178 "justification": "Section V.E states 'we report for each kernel the best HLS design obtained across multiple executions' and notes high run-to-run variance (only 1-2/10 optimal), but no distribution, mean, std dev, or min/max reported.", 179 "source": "haiku" 180 } 181 }, 182 "evaluation_design": { 183 "baselines_included": { 184 "applies": true, 185 "answer": true, 186 "justification": "Hand-tuned baselines provided for all 15 kernels (Table I). SODA framework results provided for 7 stencil kernels (Table IV).", 187 "source": "haiku" 188 }, 189 "baselines_contemporary": { 190 "applies": true, 191 "answer": false, 192 "justification": "SODA baseline is from 2018 (8 years old). Recent tools mentioned in related work (RTLCoder 2025, VerilogCoder 2025, AutoChip, RTLFixer) are not evaluated. Baselines are dated.", 193 "source": "haiku" 194 }, 195 "ablation_study": { 196 "applies": true, 197 "answer": false, 198 "justification": "Section V.D compares three LLM models but no ablation of workflow components (translator, validator, compile fixer, runtime fixer, judge, optimizer). No study of iteration necessity or feedback importance.", 199 "source": "haiku" 200 }, 201 "multiple_metrics": { 202 "applies": true, 203 "answer": true, 204 "justification": "Metrics include execution cycles (latency), FPGA resource usage (LUTs, FFs, DSPs, BRAMs), lines of code, and cyclomatic complexity. Multiple perspectives reported.", 205 "source": "haiku" 206 }, 207 "human_evaluation": { 208 "applies": false, 209 "answer": false, 210 "justification": "Not applicable. 
System automatically generates and validates code; no subjective human judgment of output quality needed.", 211 "source": "haiku" 212 }, 213 "held_out_test_set": { 214 "applies": true, 215 "answer": false, 216 "justification": "Same 15 kernels used for optimization feedback and final evaluation. Validation uses reference test on the same kernels, not held out. No train/test split.", 217 "source": "haiku" 218 }, 219 "per_category_breakdown": { 220 "applies": true, 221 "answer": true, 222 "justification": "Results broken down by kernel (15 different compute patterns in Table I), by benchmark source (7 SODA kernels in Table IV), and by model (Figure 9 shows GPT-5, GPT-5-nano, o4-mini).", 223 "source": "haiku" 224 }, 225 "failure_cases_discussed": { 226 "applies": true, 227 "answer": false, 228 "justification": "Section V.E briefly mentions 'only one or two runs produced optimal designs' and context size limits for larger kernels, but no systematic analysis or specific failure examples shown.", 229 "source": "haiku" 230 }, 231 "negative_results_reported": { 232 "applies": true, 233 "answer": false, 234 "justification": "Section V.D shows GPT-5-nano and o4-mini underperform on complex kernels (32.7% and 52.5% geomean), and context limitations mentioned, but framed as model comparison rather than negative results.", 235 "source": "haiku" 236 } 237 }, 238 "setup_transparency": { 239 "model_versions_specified": { 240 "applies": true, 241 "answer": false, 242 "justification": "Model names given (gpt-5, gpt-5-nano, o4-mini) but no exact version IDs, snapshot dates, or parameter counts specified. 
Cannot reproduce with specific model versions.", 243 "source": "haiku" 244 }, 245 "prompts_provided": { 246 "applies": true, 247 "answer": false, 248 "justification": "Section IV describes what agents do conceptually ('translator receives C++ kernel...task is to translate to HLS') but actual system prompts, instructions, or templates are not provided.", 249 "source": "haiku" 250 }, 251 "hyperparameters_reported": { 252 "applies": true, 253 "answer": false, 254 "justification": "Temperature, top-p, sampling strategy not specified. Iteration limits mentioned as 'predefined' and 'fixed' but values not given (Section V.E mentions 25 for SODA kernels but not for others).", 255 "source": "haiku" 256 }, 257 "scaffolding_described": { 258 "applies": true, 259 "answer": true, 260 "justification": "Agentic workflow extensively described in Section IV.A with roles for translator, validator, compile fixer, runtime fixer, judge, and optimizer. Figure 2 and detailed example (Figures 3-8) clarify the pipeline.", 261 "source": "haiku" 262 }, 263 "data_preprocessing_documented": { 264 "applies": false, 265 "answer": false, 266 "justification": "Not applicable. Kernels are direct C++ inputs with reference tests; no preprocessing pipeline for data collection or feature engineering.", 267 "source": "haiku" 268 } 269 }, 270 "data_integrity": { 271 "raw_data_available": { 272 "applies": true, 273 "answer": false, 274 "justification": "Kernel specifications shown in Table I and example code (Figures 3-8), but source code, test benches, and reference implementations not released for independent verification.", 275 "source": "haiku" 276 }, 277 "data_collection_described": { 278 "applies": true, 279 "answer": false, 280 "justification": "'15 kernels representing common compute patterns in HPC' stated but selection criteria not described. 
SODA kernels taken from [1] but rationale for this set not explained.", 281 "source": "haiku" 282 }, 283 "recruitment_methods_described": { 284 "applies": false, 285 "answer": false, 286 "justification": "Not applicable. No human subjects or data collection from recruitment.", 287 "source": "haiku" 288 }, 289 "data_pipeline_documented": { 290 "applies": false, 291 "answer": false, 292 "justification": "Not applicable in traditional sense. The workflow pipeline (translator → validator → optimizer) is documented, but no data collection or ETL pipeline.", 293 "source": "haiku" 294 } 295 }, 296 "contamination": { 297 "training_cutoff_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "Model training cutoffs not stated for GPT-5, GPT-5-nano, or o4-mini. These are closed-source models with unknown training data cutoff dates.", 301 "source": "haiku" 302 }, 303 "train_test_overlap_discussed": { 304 "applies": true, 305 "answer": false, 306 "justification": "Paper acknowledges 'relatively small amount of FPGA-related source code available on GitHub, which likely influenced LLM training' but does not assess whether these specific 15 kernels could have been in training data.", 307 "source": "haiku" 308 }, 309 "benchmark_contamination_addressed": { 310 "applies": true, 311 "answer": false, 312 "justification": "Kernels are not standard public benchmarks (custom 15 kernels + SODA set), reducing contamination risk. However, no explicit discussion of whether specific optimization patterns might exist in training data.", 313 "source": "haiku" 314 } 315 }, 316 "human_studies": { 317 "applies": false, 318 "answer": false, 319 "justification": "Not applicable. 
No human participants or user studies.", 320 "source": "haiku" 321 }, 322 "cost_and_practicality": { 323 "inference_cost_reported": { 324 "applies": true, 325 "answer": true, 326 "justification": "Conclusion states 'cost of approximately US$50 to translate and optimize all 15 kernels highlights the feasibility' with caveat that 'server infrastructure costs are not included.'", 327 "source": "haiku" 328 }, 329 "compute_budget_stated": { 330 "applies": true, 331 "answer": false, 332 "justification": "Only aggregate cost (~$50) reported. No per-kernel costs, iteration costs, or breakdown by agent. Infrastructure costs explicitly excluded.", 333 "source": "haiku" 334 } 335 } 336 } 337 }, 338 "claims": [ 339 { 340 "claim": "LAAFD achieves 99.9% geomean performance compared to hand-tuned baselines across 15 kernels", 341 "evidence": "Table I reports cycle counts for all 15 kernels; geometric mean calculated as 99.9% of manual baseline", 342 "supported": "moderate" 343 }, 344 { 345 "claim": "For stencil workloads, LAAFD matches SODA performance while producing more readable code", 346 "evidence": "Table IV shows comparable cycle counts; Table IV shows 8.3× fewer lines of code and 2.27× lower cyclomatic complexity than SODA", 347 "supported": "strong" 348 }, 349 { 350 "claim": "GPT-5 outperforms GPT-5-nano and o4-mini, achieving a 99.9% geomean across all 15 kernels", 351 "evidence": "Section V.D and Figure 9 show per-model performance; GPT-5 achieves ≥98% on all 15, others fail on 5-9 kernels", 352 "supported": "strong" 353 }, 354 { 355 "claim": "The workflow generates functionally correct HLS kernels through iterative translation, validation, and optimization", 356 "evidence": "Section IV describes workflow phases; Section IV.B illustrates 7-iteration example where code converges from 65,543 cycles to 16,396 cycles", 357 "supported": "strong" 358 }, 359 { 360 "claim": "Results depend heavily on model quality; only 1-2 of 10 runs achieve theoretical minimum for complex kernels", 361
"evidence": "Section V.E states 'for more complex SODA kernels, we ran 10 times...only one or two runs produced optimal designs'", 362 "supported": "strong" 363 }, 364 { 365 "claim": "Generated LAAFD kernels consume more FPGA resources than hand-tuned or SODA baselines", 366 "evidence": "Tables IV and V show up to 5.7× higher BRAM usage (S3D: 584 vs 122) and generally higher LUT/FF counts", 367 "supported": "strong" 368 }, 369 { 370 "claim": "LLM-based agentic workflows substantially lower the expertise barrier to FPGA acceleration", 371 "evidence": "Conceptual claim in abstract and conclusion; no empirical evidence (no user study, no comparison of developer productivity)", 372 "supported": "unsupported" 373 }, 374 { 375 "claim": "LAAFD solves context size limitations by removing HLS report details or eliminating the judge session", 376 "evidence": "Section V.E mentions 'we addressed this by summarizing HLS reports or...removing the dedicated judge session' but scope and impact not quantified", 377 "supported": "weak" 378 } 379 ], 380 "methodology_tags": [ 381 "benchmark-eval", 382 "case-study", 383 "empirical" 384 ], 385 "key_findings": "LAAFD uses a multi-agent LLM-based workflow (translator, validator, fixer components, judge, optimizer) to convert C++ kernels to optimized Vitis HLS code, achieving 99.9% geomean performance versus hand-tuned baselines on 15 HPC kernels and matching SODA on stencil workloads. The workflow is inherently stochastic, with only 1-2 out of 10 runs producing theoretically optimal designs for complex kernels; GPT-5 substantially outperforms GPT-5-nano (99.9% vs 32.7%) and o4-mini (99.9% vs 52.5%). 
While LAAFD produces more readable code than SODA (8.3× fewer lines, 2.27× lower complexity), generated kernels consume significantly more FPGA resources (up to 5.7× more BRAMs in S3D), and the workflow encounters context size limitations on large kernels.", 386 "red_flags": [ 387 { 388 "flag": "Best-of-run cherry-picking", 389 "detail": "Section V.E reports 'the best HLS design obtained across multiple executions' but notes that 'only one or two runs produced optimal designs' out of 10. Reporting 99.9% performance obscures that most runs were suboptimal. No distribution of run outcomes provided." 390 }, 391 { 392 "flag": "No variance reported", 393 "detail": "High run-to-run stochasticity acknowledged (10 runs, 25 iterations per complex kernel) but results shown only as best-case. Mean, median, std dev, min/max performance not provided, making typical performance unknown." 394 }, 395 { 396 "flag": "Small, unjustified sample size", 397 "detail": "15 kernels chosen to 'represent common compute patterns' but no justification for why 15 vs 10 vs 50. No power analysis or sample size calculation." 398 }, 399 { 400 "flag": "Dated baseline comparisons", 401 "detail": "SODA baseline from 2018 (8 years old). Recent tools mentioned in related work (RTLCoder 2025, VerilogCoder 2025, AutoChip 2024) not evaluated, preventing assessment vs state-of-the-art." 402 }, 403 { 404 "flag": "Model versions not specified", 405 "detail": "Model names given (GPT-5, GPT-5-nano, o4-mini) but no snapshot dates, version IDs, or parameter counts. Cannot reproduce with the exact same models." 406 }, 407 { 408 "flag": "System prompts not released", 409 "detail": "Paper describes what agents do conceptually but provides no actual system instructions, prompt templates, or few-shot examples. Cannot reproduce the exact guidance given to models." 
410 }, 411 { 412 "flag": "Hyperparameters underspecified", 413 "detail": "Temperature and top-p are not specified; iteration limits are described only as 'predefined' and 'fixed' (25 iterations mentioned for SODA kernels in V.E but not for others); validation thresholds are not quantified." 414 }, 415 { 416 "flag": "Resource overhead unexplored", 417 "detail": "Section V.B notes 'workflow was not instrumented to optimize resource utilization' and that LAAFD 'generally incurs higher resource usage.' BRAM usage up to 5.7× baseline suggests potential productionization barrier not addressed." 418 }, 419 { 420 "flag": "Context size limitations mentioned but not solved", 421 "detail": "Section V.E notes 'executions sometimes exceeded maximum context size' for larger kernels; remedy (summarizing reports, removing judge) degrades quality but scope/frequency of this issue not quantified." 422 }, 423 { 424 "flag": "No ablation study", 425 "detail": "Cannot determine which workflow components are necessary. No study of feedback importance vs random iteration, or necessity of compile fixer vs runtime fixer." 426 }, 427 { 428 "flag": "Training data contamination not addressed", 429 "detail": "Paper acknowledges limited FPGA code in training but does not assess whether these specific 15 kernels or the optimization patterns might have been in GPT-5 training data." 430 }, 431 { 432 "flag": "No statistical significance testing", 433 "detail": "Comparisons reported as descriptive percentages (99.9%, 52.5%) with no p-values, confidence intervals, or significance tests."
434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "SODA: Stencil with Optimized Dataflow Architecture", 439 "relevance": "State-of-the-art stencil HLS generator; primary baseline for comparison on 7 stencil kernels (Table IV)" 440 }, 441 { 442 "title": "StencilFlow: Mapping large stencil programs to distributed spatial computing systems", 443 "relevance": "Prior work on stencil compilation; referenced for domain-specific HLS approaches" 444 }, 445 { 446 "title": "HLS4ML: An open-source codesign workflow to empower scientific low-power machine learning devices", 447 "relevance": "High-level synthesis framework for ML; demonstrates application of HLS to specialized domain" 448 }, 449 { 450 "title": "VeriGen: A Large Language Model for Verilog Code Generation", 451 "relevance": "RTL generation using LLMs; foundational work on LLM-based hardware synthesis (Thakur et al. 2024)" 452 }, 453 { 454 "title": "RTLCoder: Fully Open-Source and Efficient LLM-Assisted RTL Code Generation Technique", 455 "relevance": "Iterative RTL synthesis with EDA feedback; similar agentic approach applied to Verilog (Liu et al. 2025)" 456 }, 457 { 458 "title": "C2HLSC: Leveraging Large Language Models to Bridge the Software-to-Hardware Design Gap", 459 "relevance": "LLM-based C-to-HLS translation with design space exploration; prior work on same problem (Collini et al. 2025)" 460 }, 461 { 462 "title": "A Survey on Code Generation with LLM-based Agents", 463 "relevance": "Recent survey of agentic code generation methods; contextualizes LAAFD within broader LLM agent ecosystem (Dong et al. 
2025)" 464 }, 465 { 466 "title": "AutoChip: Automating HDL Generation Using LLM Feedback", 467 "relevance": "Multi-turn HDL generation with LLM-as-judge pattern; precursor to judge-based optimization strategy" 468 } 469 ], 470 "engagement_factors": { 471 "practical_relevance": { 472 "score": 2, 473 "justification": "Tool could help HLS developers, but is not released; resource overhead (5.7× BRAMs in some cases) raises productionization concerns for resource-constrained FPGAs." 474 }, 475 "surprise_contrarian": { 476 "score": 1, 477 "justification": "LLMs for code generation and optimization well-established; applying them to HLS is somewhat novel but unsurprising given trends in hardware-software co-design." 478 }, 479 "fear_safety": { 480 "score": 0, 481 "justification": "No AI safety concerns; paper focuses on productivity and code generation, not autonomy or alignment." 482 }, 483 "demo_ability": { 484 "score": 0, 485 "justification": "Tool not released; no code repository, no online demo, no downloadable artifact. Readers cannot try LAAFD themselves." 486 }, 487 "drama_conflict": { 488 "score": 0, 489 "justification": "Straightforward engineering paper with no controversial claims, no disputes with prior work, no shocking findings." 490 }, 491 "brand_recognition": { 492 "score": 1, 493 "justification": "Los Alamos National Laboratory carries credibility in HPC, but not as high-profile as OpenAI/DeepMind/Google for LLM work." 494 } 495 }, 496 "hn_data": { 497 "threads": [ 498 { 499 "hn_id": "46893430", 500 "title": "The Trigger in the Haystack: Extracting and Reconstructing LLM Backdoor Triggers", 501 "points": 1, 502 "comments": 0, 503 "url": "https://news.ycombinator.com/item?id=46893430", 504 "created_at": "2026-02-04T23:28:00Z" 505 } 506 ], 507 "top_points": 1, 508 "total_points": 1, 509 "total_comments": 0 510 } 511 }