scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23070B)
      1 {
      2   "paper": {
      3     "title": "A2H-MAS: An Algorithm-to-HLS Multi-Agent System for Automated and Reliable FPGA Implementation",
      4     "authors": [
      5       "Jie Lei",
      6       "Ruofan Jia",
      7       "J. Andrew Zhang",
      8       "Hao Zhang"
      9     ],
     10     "year": 2025,
     11     "arxiv_id": "2508.10904"
     12   },
     13   "scan_version": 3,
     14   "active_modules": [
     15     "experimental_rigor"
     16   ],
     17   "methodology_tags": [
     18     "case-study",
     19     "benchmark-eval"
     20   ],
     21   "key_findings": "A2H-MAS decomposes MATLAB-to-HLS conversion into 8 specialized agent phases with standardized interfaces and deterministic validation. On 5G NR and WLAN synchronization tasks, the system produces functionally correct hardware designs operating at 292 and 338 MHz respectively. Ablation shows that algorithm-level restructuring (Adaptation) provides the largest resource reductions (e.g., 98% LUT reduction for calcThreshold), with Refinement providing further gains. The Direct LLM translation baseline failed timing closure on one module.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No datasets, MATLAB source files, or HLS outputs are released. The test algorithms are described but not made available."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions Xilinx Vitis HLS, MATLAB, and NI USRP X310 but provides no version numbers, dependency specifications, or environment setup details."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Results in Tables I and II report only point estimates for resource usage, clock frequency, and latency with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims A2H-MAS is effective compared to direct translation but provides no statistical significance tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Concrete resource reductions with baseline context are reported, e.g., 'LUT consumption is reduced from 36,500 to 685 for calcThreshold' (Section V-B), providing magnitude of effect."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Only two wireless communication systems tested with a handful of submodules. No justification for why this sample is sufficient."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "All results appear to be from single runs. No variance, standard deviation, or multiple-run results are reported."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The ablation study in Table II compares Direct (naive LLM translation), Adaptation, and Refinement strategies."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No comparison against other contemporary LLM-based hardware generation systems (VeriMind, HLSPilot, HDLAgent, AutoChip) despite discussing them in related work. The only baseline is the authors' own naive Direct translation."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table II presents ablation results comparing Direct, Adaptation, and Refinement stages on calcThreshold and extractSSBsig modules."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Results report LUTs, FFs, DSP, BRAMs, clock frequency (MHz), and latency — multiple complementary hardware metrics."
     92       },
     93       "human_evaluation": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "Human evaluation is not relevant here; correctness is verified through automated simulation (C simulation, synthesis, RTL co-simulation) and on-board hardware validation."
     97       },
     98       "held_out_test_set": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "Not an ML model evaluated on train/test splits. The system is tested on engineering tasks with deterministic correctness criteria."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table I provides per-submodule breakdowns for all modules in both 5G NR (5 submodules + top) and WLAN (4 submodules + top) tasks."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The Direct strategy for calcThreshold 'Failed' to achieve post-route timing closure (Table II), explicitly reported and discussed."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The Direct baseline failing timing closure for calcThreshold is a negative result. Increased BRAM usage from integration overhead is also noted."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of 'functionally correct, resource-efficient, and latency-optimized HLS designs' are supported by Tables I and II showing working implementations with specific resource and timing numbers."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The ablation study (Direct → Adaptation → Refinement) uses controlled single-variable manipulation to show each stage's causal contribution to resource reduction."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims general 'Algorithm-to-HLS' capability, but results are limited to two wireless communication tasks. The conclusion mentions extending to 'computer vision and signal processing' without bounding current claims."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No discussion of alternative explanations. Whether improvements stem from the multi-agent architecture vs. the knowledge library vs. specific algorithmic transformations is not disentangled beyond the 3-level ablation."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures functional correctness (C simulation pass), resource usage (LUTs, FFs, DSP, BRAM), and clock frequency, then frames the system as producing 'reliable and high-quality hardware implementations' and demonstrating 'effectiveness and robustness for complex hardware development workflows.' The gap between measured metrics (correctness + resources on 2 tasks) and the broader claims (reliable, robust, complex workflows) is not acknowledged."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Section V states 'Claude Code was employed' but provides no model version, snapshot date, or API version. Reference [6] cites 'Claude sonnet 4' without a specific version identifier."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Figures 2 and 3 show structured prompt templates with agent type, core mission, input/output parameters, workflow phases, and tool commands. Substantial detail on agent prompt structure is provided."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the Claude Code usage."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The multi-agent scaffolding is described in detail: Sections III and IV cover standardized interfaces (Fig. 2), rule-guided workflows (Fig. 3), deterministic tool usage, feedback mechanisms, and the 8-phase pipeline (Fig. 5)."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Preprocessing is documented: modularization (Phase I), test data generation from intermediate variables (Phase II), function flattening (Phase III), with standardized naming conventions."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations or threats-to-validity section. The conclusion mentions future work but does not discuss limitations of the current system."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No specific threats to validity are discussed anywhere in the paper."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit statements about what the results do NOT show. Future work mentions extending to other domains but does not state specific scope boundaries for current claims."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw data (MATLAB source files, generated HLS code, synthesis reports) is made available for independent verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section IV-B describes test data generation: executing the original algorithm, recording intermediate variables, and storing with standardized naming conventions."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The study evaluates automated hardware generation on specific algorithms."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline from MATLAB input through modularization, flattening, optimization, translation, refinement, and integration is documented in Section IV with figures."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information or acknowledgments section is present in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Technology Sydney and Xidian University."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses Anthropic's Claude Code but does not disclose any relationship with Anthropic."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The paper evaluates a multi-agent system's engineering outputs, not a pre-trained model's knowledge on a standard benchmark."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Not evaluating a pre-trained model on a benchmark; evaluating a tool pipeline on custom engineering tasks."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No standard benchmark evaluation of model knowledge is conducted."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No API costs, token consumption, or wall-clock time for the LLM-based code generation process is reported despite using Claude Code extensively."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget, API spend, or hardware resources used for the generation process is stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "LLM outputs are non-deterministic, but no sensitivity analysis across multiple runs is reported. All results appear to be from single runs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is not stated. Results appear to be single-run."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The Refinement phase includes design space exploration (DSE) but no budget (number of configurations tried, compute spent on search) is reported."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "DSE is mentioned in Phase VII, but no details are given on how many alternatives were explored or how the best configuration was selected."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own system against their own naive baseline (Direct translation). No acknowledgment of self-comparison bias or independent evaluation."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Adaptation and Refinement stages require additional LLM calls and synthesis runs compared to Direct translation, but compute costs are not compared across the three strategies."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether the two wireless communication tasks are representative of the broader claim of 'automated and reliable FPGA implementation.'"
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The ablation compares Direct (single LLM call) vs Adaptation vs Refinement (multi-agent pipeline with knowledge library), but these differ in both algorithmic approach AND scaffolding complexity. The paper does not discuss whether improvements stem from the multi-agent scaffold vs. the algorithmic transformations vs. the knowledge library, attributing all gains to the system as a whole."
    345       }
    346     }
    347   },
    348   "claims": [
    349     {
    350       "claim": "A2H-MAS consistently produces functionally correct, resource-efficient, and latency-optimized HLS designs for wireless communication algorithms.",
    351       "evidence": "Tables I and II show implementation results for 5G NR SSB detection (292.23 MHz top-level) and WLAN synchronization (337.61 MHz top-level), with all submodules passing C simulation, synthesis, and RTL co-simulation (Section V).",
    352       "supported": "moderate"
    353     },
    354     {
    355       "claim": "Algorithm-level transformation yields order-of-magnitude improvements over direct LLM translation.",
    356       "evidence": "Table II ablation: calcThreshold LUTs reduced from 36,500 (Direct, failed timing) to 685 (Adaptation) to 173 (Refinement). extractSSBsig LUTs from 4,468 to 275 to 155.",
    357       "supported": "moderate"
    358     },
    359     {
    360       "claim": "The multi-agent design mitigates hallucination, forgetting, and instability of LLMs.",
    361       "evidence": "Claimed throughout Sections I and III but supported only by the ablation showing the staged pipeline produces better results than direct translation. No direct measurement of hallucination rates or comparison with single-agent approaches.",
    362       "supported": "weak"
    363     }
    364   ],
    365   "red_flags": [
    366     {
    367       "flag": "Very narrow evaluation scope",
    368       "detail": "Only two wireless communication systems tested with a handful of submodules. Claims of general effectiveness for 'automated and reliable FPGA implementation' are not well-supported."
    369     },
    370     {
    371       "flag": "No comparison with existing systems",
    372       "detail": "Despite discussing AutoChip, HDLAgent, VeriMind, and HLSPilot in related work, no direct experimental comparison is made against any of them."
    373     },
    374     {
    375       "flag": "No reproducibility artifacts",
    376       "detail": "No code, data, or environment specifications released. The exact LLM model version is not specified (only 'Claude Code')."
    377     },
    378     {
    379       "flag": "No variance or repeatability analysis",
    380       "detail": "All results are single-run with no indication of result stability, which is important given LLM non-determinism."
    381     },
    382     {
    383       "flag": "No limitations section",
    384       "detail": "The paper has no discussion of limitations, threats to validity, or scope boundaries despite making broad claims."
    385     }
    386   ],
    387   "cited_papers": [
    388     {
    389       "title": "ChatDev: Communicative Agents for Software Development",
    390       "authors": [
    391         "C. Qian",
    392         "W. Liu",
    393         "H. Liu"
    394       ],
    395       "year": 2023,
    396       "arxiv_id": "2307.07924",
    397       "relevance": "Multi-agent framework for software development, directly relevant to agentic AI coding systems."
    398     },
    399     {
    400       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    401       "authors": [
    402         "S. Hong",
    403         "M. Zhuge",
    404         "J. Chen"
    405       ],
    406       "year": 2024,
    407       "relevance": "Multi-agent collaboration framework for programming tasks."
    408     },
    409     {
    410       "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery",
    411       "authors": [
    412         "A. Novikov"
    413       ],
    414       "year": 2025,
    415       "arxiv_id": "2506.13131",
    416       "relevance": "LLM-based coding agent for algorithmic discovery, relevant to agentic AI capabilities."
    417     },
    418     {
    419       "title": "VeriMind: Agentic LLM for Automated Verilog Generation with a Novel Evaluation Metric",
    420       "authors": [
    421         "B. Nadimi",
    422         "G. O. Boutaib",
    423         "H. Zheng"
    424       ],
    425       "year": 2025,
    426       "arxiv_id": "2503.16514",
    427       "relevance": "Multi-agent system for hardware code generation, directly comparable approach."
    428     },
    429     {
    430       "title": "AutoChip: Automating HDL Generation Using LLM Feedback",
    431       "authors": [
    432         "S. Thakur",
    433         "J. Blocklove",
    434         "H. Pearce"
    435       ],
    436       "year": 2023,
    437       "arxiv_id": "2311.04887",
    438       "relevance": "LLM-based iterative hardware code generation with compiler feedback loops."
    439     },
    440     {
    441       "title": "HLSPilot: LLM-based High-Level Synthesis",
    442       "authors": [
    443         "C. Xiong",
    444         "C. Liu",
    445         "H. Li",
    446         "X. Li"
    447       ],
    448       "year": 2024,
    449       "relevance": "LLM-based HLS code generation system, direct competitor to A2H-MAS."
    450     },
    451     {
    452       "title": "ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate",
    453       "authors": [
    454         "C.-M. Chan",
    455         "W. Chen",
    456         "Y. Su"
    457       ],
    458       "year": 2023,
    459       "arxiv_id": "2308.07201",
    460       "relevance": "Multi-agent debate framework for improving LLM output quality."
    461     },
    462     {
    463       "title": "GPT-4 Technical Report",
    464       "authors": [
    465         "J. Achiam",
    466         "S. Adler",
    467         "S. Agarwal"
    468       ],
    469       "year": 2023,
    470       "arxiv_id": "2303.08774",
    471       "relevance": "Core LLM capability paper relevant to AI code generation."
    472     },
    473     {
    474       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    475       "authors": [
    476         "J. S. Park",
    477         "J. O'Brien",
    478         "C. J. Cai"
    479       ],
    480       "year": 2023,
    481       "relevance": "Foundational work on generative multi-agent systems."
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 1,
    487       "justification": "Relevant only to the narrow intersection of FPGA designers working with MATLAB-to-HLS flows, not broadly applicable to most developers."
    488     },
    489     "surprise_contrarian": {
    490       "score": 1,
    491       "justification": "The finding that algorithm-level restructuring matters more than pragma tuning is known in the HLS community, though the magnitude (98% LUT reduction) is notable."
    492     },
    493     "fear_safety": {
    494       "score": 0,
    495       "justification": "No safety, security, or risk angle whatsoever."
    496     },
    497     "drama_conflict": {
    498       "score": 0,
    499       "justification": "No controversy, no challenges to specific companies or benchmarks, purely constructive contribution."
    500     },
    501     "demo_ability": {
    502       "score": 0,
    503       "justification": "No code, no demo, and no reproducibility artifacts are released; even trying the approach would require proprietary FPGA toolchains."
    504     },
    505     "brand_recognition": {
    506       "score": 1,
    507       "justification": "From University of Technology Sydney, a recognized institution but not a famous name in tech; mentions Claude Code but is not from Anthropic."
    508     }
    509   }
    510 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs