scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31616B)
      1 {
      2   "paper": {
      3     "title": "LAAFD: LLM-based Agents for Accelerated FPGA Design",
      4     "authors": [
      5       "Maxim Moraru",
      6       "Kamalavasan Kamalakkannan",
      7       "Jered Dominguez-Trujillo",
      8       "Patrick Diehl",
      9       "Atanu Barai",
     10       "Julien Loiseau",
     11       "Zachary Kent Baker",
     12       "Howard Pritchard",
     13       "Galen M. Shipman"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv",
     17     "arxiv_id": "2602.06085"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "LAAFD, an agentic workflow using LLMs (GPT-5, o4-mini, GPT-5-nano), translates C++ kernels into optimized Vitis HLS code for FPGAs. On 15 HPC kernels, the best runs achieve 99.9% geomean performance vs hand-tuned baselines. For stencil workloads, LAAFD matches SODA (a domain-specific HLS generator) while producing 8.3x fewer lines of code. However, results are reported as best-of-multiple-runs, and only GPT-5 achieves high performance across all kernels—o4-mini and GPT-5-nano perform poorly on complex kernels (52.5% and 32.7% geomean respectively).",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No repository URL, code archive, or supplementary material link is provided anywhere in the paper. The LAAFD system, agent prompts, and kernel benchmarks are not released."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The C++ kernels, test benches, HLS reports, and generated code are not released. No download links or data archives are mentioned."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Table III specifies the HLS tool (Vitis 2022.2), FPGA target (xcu250-figd2104-2L-e), and target frequency (200 MHz). However, the software environment for the agentic workflow (Python version, API client versions, orchestration framework) is not specified."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No reproduction instructions, scripts, or step-by-step guides are provided. The workflow is described at a high level but not with enough detail to reproduce."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables I and IV report single point estimates (cycle counts) with no confidence intervals, error bars, or uncertainty measures, despite the authors acknowledging stochastic variation across runs in Section V.E."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests are used. Claims such as '99.9% geomean performance' and comparisons between models are based on raw numbers without any hypothesis testing."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Effect sizes are reported as percentage of ideal/baseline performance: '99.9% geomean' for GPT-5, '52.5%' for o4-mini, '32.7%' for GPT-5-nano (Section V.D). Tables I and IV provide absolute cycle counts alongside ideal minimums, giving full context for the magnitude of differences."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper evaluates 15 kernels plus 7 SODA kernels but provides no justification for why this number is sufficient to support claims about FPGA design automation broadly. No power analysis or sample size rationale is given."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Section V.E acknowledges stochastic variation: 'different runs may yield different results even under identical settings.' Yet only the best results are reported with no standard deviation, range, or distribution across runs. For SODA kernels, 'only one or two [of ten] runs produced HLS code deemed optimal' but no variance is quantified."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Two baselines are included: manually optimized HLS kernels (Table I) and SODA-generated kernels (Table IV). Both represent reasonable comparison points."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper justifies SODA (2018) as 'state-of-the-art DSL-based HLS code generator for stencil solvers' in Section III. The manually tuned baselines are custom implementations, not stale prior work. Recent related tools (HLSPilot, C2HLSC) are discussed but differ in scope."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The system has multiple components (translator, compile fixer, runtime fixer, judge, optimizer) but no ablation study removes individual components to measure their contribution. The LLM comparison (GPT-5 vs o4-mini vs GPT-5-nano) tests model capability, not workflow component importance."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Multiple metrics are reported: execution cycles/latency (Tables I, IV), FPGA resource utilization—LUT, FF, DSP, BRAM (Tables IV, V), lines of code (Tables IV, V), and cyclomatic complexity (Table VI)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation is conducted. The paper claims LAAFD produces 'more readable kernels' and 'high quality, readable, and less complex' code (Section V.C) but uses only automated proxies (LoC, cyclomatic complexity) without human readability assessment."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No separation between development and evaluation sets. The same 15 kernels were presumably used to develop and tune the agentic workflow prompts, and then to report final results. No held-out evaluation is described."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Tables I and IV provide per-kernel cycle counts. Table II maps optimizations per kernel. Table V provides per-kernel resource utilization. Table VI provides per-kernel complexity. Figure 9 shows per-kernel model comparison."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section V.E discusses failures: 'Only one or two runs produced HLS code deemed optimal...the remaining runs produced functionally correct but suboptimal designs.' Context size limits for larger kernels are also discussed. Section V.D reports GPT-5-nano and o4-mini struggling on complex stencil kernels."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "GPT-5-nano (32.7% geomean) and o4-mini (52.5% geomean) perform poorly on complex kernels. LAAFD uses more FPGA resources than manual baselines (Section V.B). Scalability limitations with large kernels are reported (Section V.E). Stochastic failures requiring multiple runs are documented."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims are supported: '99.9% geomean performance' is in Table I; 'matches the performance of SODA' is in Table IV; 'more readable kernels' is supported by LoC/complexity comparisons in Tables IV and VI. The abstract appropriately hedges with 'suggest' for the expertise barrier claim."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims LAAFD 'substantially lowers the expertise barrier to FPGA acceleration' (abstract) and that the workflow 'enables' optimization. These are causal claims but no study of actual developer experience or expertise requirements is conducted. The evidence shows cycle-count parity, not expertise barrier reduction."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Accelerated FPGA Design' broadly, but results cover only 15 HPC kernels on a single FPGA (xcu250) with one tool (Vitis 2022.2). Section V.E notes 'effectiveness depends on kernel complexity' and the Outlook mentions extending to full applications, but the framing in the title and abstract extends beyond what was tested."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations are considered. For example, the near-ideal performance could partly reflect the structured nature of the benchmark kernels rather than general optimization capability. The paper does not discuss whether simpler approaches (template matching, DSL transformation) could achieve similar results."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper measures cycle counts and code complexity but claims to 'lower the expertise barrier to FPGA acceleration.' The gap between the proxy (cycle count parity) and the outcome (reduced expertise requirement) is not acknowledged. No actual user study measures expertise barrier reduction."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Table III lists 'gpt-5, o4-mini, gpt5-nano' without API versions, snapshot dates, or model IDs. These are marketing names that do not uniquely identify the model behavior."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Agent roles are described in natural language (Section IV.A) and example feedback is shown (Figs. 6, 8), but the actual system prompts, judge prompts, and optimizer prompts used in the agentic workflow are not provided. The 'list of possible HLS optimizations' given to the judge is mentioned but not shown."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. The iteration limit ('25 optimization iterations') is mentioned for SODA kernels but not for the full suite. No other hyperparameters are documented."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The agentic workflow is described in detail in Section IV.A with a workflow diagram (Fig. 2). Roles of each agent (translator, compile fixer, runtime fixer, judge, optimizer) are explained. Independent context sessions, iteration limits, and the judge-optimizer feedback loop are documented. An illustrative example walks through the full pipeline (Section IV.A.4)."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section IV.B describes kernel design and selection. Input C++ kernels are documented (e.g., Fig. 3). The translation constraints (preserve parameter count, allow type modifications) are specified. SODA kernel origins are cited. Table II maps optimizations per kernel."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section V.E 'Discussions' serves as a substantive limitations section. It discusses stochastic variation across runs, best-of-N reporting methodology, context size limitations for large kernels, and scalability challenges."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section V.E raises study-specific threats: stochastic results requiring multiple runs ('Only one or two [of ten] runs produced HLS code deemed optimal'), context size limits being exceeded for SODA kernels (requiring report summarization), and dependence on kernel complexity. These are specific to this work, not boilerplate."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The paper states scope boundaries: 'This study focused on kernels; however, an important next step is to extend the methodology to full applications' (Outlook). Section V.E: 'effectiveness depends on kernel complexity, semantic correctness, and context limitations.' Resource optimization was explicitly excluded: 'the workflow was not instrumented to optimize resource utilization at any stage' (Section V.B)."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw data is released: no generated HLS code, no HLS synthesis reports, no workflow logs, no intermediate outputs. Only summarized cycle counts and resource numbers in tables are provided."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Kernel selection is described in Section IV.B. Each kernel's purpose, dimension, and required optimizations are documented in Tables I and II. SODA kernels are sourced from Chi et al. [1] with equivalent C++ golden models written by the authors."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The kernel selection rationale is described: 'a set of kernels with varying levels of complexity, ranging from simple single-loop structures to deeply nested loops involving multiple memory access operations' (Section IV.B). SODA kernels are explicitly sourced from prior work [1]."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The workflow pipeline is described (Fig. 2), but the data pipeline from experiment execution to reported numbers is not fully documented. Beyond the statement that the workflow was run ten times per kernel for the SODA kernels, the paper does not specify for the full kernel suite how many total runs were executed, how results were aggregated, or what happened with failed runs."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The Acknowledgment section states release under LA-UR-26-20594 and identifies Los Alamos National Laboratory, operated by Triad National Security for the National Nuclear Security Administration (DOE contract 89233218CNA000001)."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All nine authors are listed as affiliated with Los Alamos National Laboratory. They evaluate OpenAI models (GPT-5, o4-mini, GPT-5-nano), not their own products, so no product-affiliation conflict exists."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The funder (DOE/NNSA via LANL) has no financial interest in the performance of OpenAI's models or FPGA design tools. The funding is for national laboratory research, independent of the evaluated products."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement is included in the paper. While no obvious financial conflicts are apparent, the absence of a formal declaration does not constitute a declaration of no conflicts."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No training data cutoff dates are stated for GPT-5, o4-mini, or GPT-5-nano. The paper evaluates these models' ability to generate HLS code but does not address when their training data was collected."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the LLMs' training data included similar HLS code patterns. Section VI notes 'the relatively small amount of FPGA-related source code available on GitHub, which likely influenced LLM training' but this tangentially acknowledges limited training data rather than analyzing overlap."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The custom kernels likely reduce contamination risk, but this is not explicitly discussed as a design choice. No contamination analysis is performed. SODA's published DSL code and prior HLS literature could be in training data."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. The evaluation is entirely automated using HLS tools and cycle count comparisons."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. The study evaluates LLM-generated hardware code."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in the study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in the study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section VI: 'the cost of approximately US$50 to translate and optimize all 15 kernels highlights the feasibility of this approach, though server infrastructure costs are not included.' API costs are reported, though server-side HLS synthesis costs are excluded."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The $50 API cost is mentioned but total compute budget is not quantified. HLS synthesis time, number of co-simulations, total wall-clock time, and server compute costs are not reported. For SODA kernels, ten runs per kernel with 25 optimization iterations each are stated, but the corresponding compute time is not."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Section V.E acknowledges stochastic variation: 'different runs may yield different results even under identical settings.' However, only the best results are reported. No seed sensitivity analysis or distribution of outcomes across runs is provided."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section V.E: 'For the more complex SODA kernels, we ran the workflow ten times per kernel with 25 optimization iterations each.' For simpler kernels, 'sometimes even when using a smaller model' suggests fewer runs, though exact counts are not given for all kernels."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search is described. The workflow parameters (iteration limits, agent configurations) appear fixed without justification. The 25-iteration limit for SODA kernels is mentioned but not justified."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Section V.E explicitly states: 'we report for each kernel the best HLS design obtained across multiple executions.' This is best-of-N cherry-picking without reporting the distribution of outcomes. The selection criterion is 'deemed optimal by the judge agent' but this inflates reported performance."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed at all, so multiple comparison correction does not arise. All comparisons are raw number differences."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The manually tuned baselines (Table I) are the authors' own implementations, yet no acknowledgment of self-comparison bias is made. The optimality of the hand-tuned baselines is assumed without independent verification."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No performance-vs-compute analysis. GPT-5 (99.9%) costs more than o4-mini (52.5%) and GPT-5-nano (32.7%), but no cost-performance tradeoff curves are presented. The $50 total cost is not broken down by model or kernel."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper claims the 15 kernels represent 'common compute patterns in HPC' but does not validate this claim. No discussion of whether these kernels adequately represent the breadth of FPGA design challenges, or whether cycle count parity captures meaningful design quality."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "All three LLMs (GPT-5, o4-mini, GPT-5-nano) are evaluated within the same LAAFD scaffold, isolating model differences from scaffold differences. The comparison in Section V.D and Figure 9 uses identical workflow across models."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of temporal leakage. The models could have been trained on HLS examples, SODA papers, or similar kernel implementations without any analysis of this risk."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup leaks information. The agents receive HLS reports and error messages as feedback, which is by design, but no analysis of whether this constitutes information leakage vs. legitimate feedback."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of independence between training data and test kernels. The kernels could share patterns with HLS examples in the models' training data."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention methods are used. No analysis of whether the models have memorized HLS patterns from training."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "LAAFD achieves 99.9% geomean performance compared to hand-tuned HLS baselines across 15 HPC kernels using GPT-5.",
    374       "evidence": "Table I shows per-kernel cycle counts for LAAFD vs manual baselines and ideal minimums. Section V.A computes the geomean. However, Section V.E reveals these are best-of-multiple-runs results.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "LAAFD matches the performance of SODA, a state-of-the-art stencil HLS code generator, on 7 stencil kernels.",
    379       "evidence": "Table IV compares LAAFD-generated vs SODA-generated kernel latencies against ideal. LAAFD achieves comparable or slightly better cycle counts. Best-of-10 runs selection (Section V.E).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "LAAFD generates more readable and less complex code than SODA (8.3x fewer LoC, 2.27x lower cyclomatic complexity on average).",
    384       "evidence": "Table IV reports lines of code (LAAFD vs SODA), Table VI reports cyclomatic complexity ratios. These are objective metrics directly measured from the generated code.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "GPT-5 substantially outperforms o4-mini (52.5% geomean) and GPT-5-nano (32.7% geomean) for HLS optimization within LAAFD.",
    389       "evidence": "Section V.D and Figure 9 compare per-kernel performance across three models. GPT-5-nano and o4-mini fail on complex stencil kernels requiring advanced optimizations like perfect data reuse and shift buffers.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The total cost to translate and optimize all 15 kernels is approximately US$50.",
    394       "evidence": "Section VI states this cost figure but provides no breakdown by model, kernel, or run. Server infrastructure costs are explicitly excluded.",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "LAAFD substantially lowers the expertise barrier to FPGA acceleration.",
    399       "evidence": "Inferred from cycle-count parity with hand-tuned baselines (Table I) and automated workflow description (Section IV). No direct measurement of expertise requirements or developer experience.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "Best-of-N reporting inflates performance",
    406       "detail": "Section V.E explicitly states: 'we report for each kernel the best HLS design obtained across multiple executions.' For SODA kernels, only 1-2 of 10 runs produced optimal code. The 99.9% geomean headline represents cherry-picked best cases, not expected performance. No distribution or expected-case performance is reported."
    407     },
    408     {
    409       "flag": "No error bars despite acknowledged stochasticity",
    410       "detail": "The authors acknowledge 'different runs may yield different results even under identical settings' (Section V.E) yet report only single point estimates. The variance across runs could be large given that most runs produce 'functionally correct but suboptimal designs.'"
    411     },
    412     {
    413       "flag": "Custom baselines without independent verification",
    414       "detail": "The manually optimized baselines in Table I are the authors' own implementations. Their optimality is assumed but not independently verified. If the baselines are suboptimal, matching them is less impressive."
    415     },
    416     {
    417       "flag": "Readability claim without human evaluation",
    418       "detail": "The paper claims 'more readable kernels' and 'high quality, readable, and less complex' code based only on LoC and cyclomatic complexity metrics. No human readability assessment is conducted. Fewer lines of code do not necessarily mean more readable code."
    419     },
    420     {
    421       "flag": "No code or data release",
    422       "detail": "The LAAFD system, agent prompts, kernels, test benches, and generated HLS code are not released, making independent verification impossible."
    423     },
    424     {
    425       "flag": "Resource overhead downplayed",
    426       "detail": "LAAFD uses substantially more FPGA resources than baselines (e.g., S3D: 584 BRAM vs 122 manual, S2D2: 90 BRAM vs 28 manual). This is noted but framed as secondary since the workflow 'was not instrumented to optimize resource utilization.' In practice, FPGA resource constraints are critical design considerations."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "A Review on Code Generation with LLMs: Application and Evaluation",
    432       "authors": ["J. Wang", "Y. Chen"],
    433       "year": 2023,
    434       "relevance": "Survey of LLM-based code generation covering application areas and evaluation methods."
    435     },
    436     {
    437       "title": "Test-Driven Development and LLM-based Code Generation",
    438       "authors": ["N. S. Mathews", "M. Nagappan"],
    439       "year": 2024,
    440       "doi": "10.1145/3691620.3695527",
    441       "relevance": "Evaluates test-driven approaches to LLM code generation, relevant to understanding feedback-based code improvement."
    442     },
    443     {
    444       "title": "LLM-Based Test-Driven Interactive Code Generation: User Study and Empirical Evaluation",
    445       "authors": ["S. Fakhoury", "A. Naik", "G. Sakkas", "S. Chakraborty", "S. K. Lahiri"],
    446       "year": 2024,
    447       "relevance": "Empirical evaluation of interactive LLM code generation with user studies, relevant to understanding agentic code workflows."
    448     },
    449     {
    450       "title": "A Survey on Code Generation with LLM-based Agents",
    451       "authors": ["Y. Dong", "X. Jiang", "J. Qian", "T. Wang", "K. Zhang", "Z. Jin", "G. Li"],
    452       "year": 2025,
    453       "arxiv_id": "2508.00083",
    454       "relevance": "Survey specifically covering agentic LLM code generation, directly relevant to LAAFD's approach."
    455     },
    456     {
    457       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    458       "authors": ["S. Thakur"],
    459       "year": 2024,
    460       "doi": "10.1145/3643681",
    461       "relevance": "Demonstrates fine-tuned LLMs for hardware description language generation, foundational work for LLM-based hardware design."
    462     },
    463     {
    464       "title": "RTLCoder: Fully Open-Source and Efficient LLM-Assisted RTL Code Generation Technique",
    465       "authors": ["S. Liu"],
    466       "year": 2025,
    467       "relevance": "Open-source LLM for RTL code generation that outperforms GPT-4, relevant as a baseline approach for hardware code generation."
    468     },
    469     {
    470       "title": "AutoChip: Automating HDL Generation Using LLM Feedback",
    471       "authors": ["S. Thakur", "J. Blocklove", "H. Pearce", "B. Tan", "S. Garg", "R. Karri"],
    472       "year": 2024,
    473       "arxiv_id": "2311.04887",
    474       "relevance": "Iterative LLM-based HDL generation using compilation feedback, a precursor to LAAFD's feedback-driven approach."
    475     },
    476     {
    477       "title": "VerilogCoder: Autonomous Verilog Coding Agents with Graph-based Planning and Abstract Syntax Tree (AST)-based Waveform Tracing Tool",
    478       "authors": ["C.-T. Ho", "H. Ren", "B. Khailany"],
    479       "year": 2025,
    480       "arxiv_id": "2408.08927",
    481       "relevance": "Multi-agent system for Verilog generation achieving 94.2% pass rate, demonstrating agentic approaches for hardware code."
    482     },
    483     {
    484       "title": "HLSPilot: LLM-based High-Level Synthesis",
    485       "authors": ["C. Xiong", "C. Liu", "H. Li", "X. Li"],
    486       "year": 2025,
    487       "doi": "10.1145/3676536.3676781",
    488       "relevance": "Most directly comparable prior work: LLM-based HLS optimization. LAAFD claims to go further in optimization depth and kernel complexity."
    489     },
    490     {
    491       "title": "C2HLSC: Leveraging Large Language Models to Bridge the Software-to-Hardware Design Gap",
    492       "authors": ["L. Collini", "S. Garg", "R. Karri"],
    493       "year": 2025,
    494       "doi": "10.1145/3734524",
    495       "relevance": "C-to-HLS translation using LLMs with HLS report feedback, closest related work to LAAFD's methodology."
    496     },
    497     {
    498       "title": "Are LLMs Any Good for High-Level Synthesis?",
    499       "authors": ["Y. Liao", "T. Adegbija", "R. Lysecky"],
    500       "year": 2025,
    501       "doi": "10.1145/3676536.3699507",
    502       "relevance": "Survey of LLM capabilities for HLS, providing context for the challenges LAAFD addresses."
    503     }
    504   ]
    505 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs