scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19737B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A2H-MAS: An Algorithm-to-HLS Multi-Agent System for Automated and Reliable FPGA Implementation",
      6     "authors": [
      7       "Jie Lei",
      8       "Ruofan Jia",
      9       "J. Andrew Zhang",
     10       "Hao Zhang"
     11     ],
     12     "year": 2025,
     13     "venue": "Unknown",
     14     "arxiv_id": "2508.10904",
     15     "doi": "10.48550/arXiv.2508.10904"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Core claims (functionally correct, resource-efficient, latency-optimized designs) are supported by Table I–II results with measured LUT, DSP, BRAM, frequency, and latency metrics for both 5G NR and WLAN implementations.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Claim that 'algorithm choice has larger effect than pragma tuning' is justified by Table II ablation: Adaptation stage reduces calcThreshold LUTs from 36,500→685 (53×) and extractSSBsig 4,468→275 (16×), demonstrating algorithmic transformation impact.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Paper claims 'demonstrates effectiveness and robustness for complex hardware development workflows' based on only 2 systems (5G NR, WLAN). Conclusion acknowledges 'future work' to support broader domains, contradicting broad generalization claims.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Paper explains design choices (why multi-agent > single agent, why algorithm-aware > pragma-only) but does not discuss alternative interpretations of experimental results or competing explanations for performance improvements.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Claims ('resource-efficient', 'latency-optimized', 'functionally correct') directly match measured outcomes (LUTs/DSP/BRAM, frequency/latency, pass/fail verification). No proxy–measurement mismatch.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations or threats-to-validity section. Conclusion discusses future extensions (broader domains, richer feedback) but not current methodological limitations or failure modes.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No discussion of specific threats: only 2 applications tested (no justification for sample size), reliance on proprietary Claude API (reproducibility risk), no comparison against published competing systems (HLSPilot, VeriMind, HDLAgent).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Paper mentions 'current implementation focuses on synchronization stage' for WLAN but does not state what algorithm types, hardware targets, or problem sizes the system does NOT handle well.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding statement provided in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations clearly stated: University of Technology Sydney and Xidian University.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funder mentioned.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No statement of competing interests or financial disclosures (patents, equity, consulting relationships with Anthropic or FPGA vendors).",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms adequately defined in context: HLS, dataflow decomposition, algorithm–hardware co-design explained; standard acronyms (FPGA, DSP, BRAM) assumed for audience.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contributions explicitly stated (Section I): (1) A2H-MAS framework for MATLAB→HLS conversion, (2) algorithm–hardware co-design methodology, (3) empirical validation on wireless algorithms.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Related work (Section II) engages with VerilogEval, MG-Verilog, VGen, VeriMind, HLSPilot, HDLAgent, ChatDev, MetaGPT; contrasts fine-tuning vs zero-shot, single agent vs multi-agent approaches.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No source code, prompts, or generated HLS implementations are released. Paper describes system but provides no reproducible artifacts.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "MATLAB algorithms (5G NR, WLAN) are reference standards, not novel datasets. Generated test data and results are not released.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Mentions 'Xilinx Vitis HLS, MATLAB, RFNoC, Claude Code' without version numbers, dependency specifications, Docker images, or requirements files.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Workflow described in Section IV but no step-by-step instructions for independent reproduction. Relies on proprietary system not available to readers.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Tables I–II show single-run results with no confidence intervals, error bars, variance/std dev, significance tests, or sample-size justification. No multiple independent runs reported.",
    149         "source": "haiku"
    150       },
    151       "evaluation_design": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Baselines included (Direct vs Adaptation vs Refinement) but baseline is naive strawman, not comparison with published methods (HLSPilot, VeriMind, HDLAgent mentioned in related work but not empirically compared). No discussion of failure cases beyond one timing closure failure. No statistical significance testing despite comparative claims.",
    155         "source": "haiku"
    156       },
    157       "setup_transparency": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "LLM model version unspecified ('Claude Code' used without identifying Claude 3.5 Sonnet vs Opus; no training cutoff). Figures 2–3 show example prompts but actual prompts used in experiments not fully provided. No temperature, top-p, or other hyperparameters reported.",
    161         "source": "haiku"
    162       },
    163       "data_integrity": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Test data generation clearly documented (Phase II: execute original algorithm, record intermediate variables as I/O pairs). Data pipeline transparent (Phases I–VIII). Reference standards (5G NR, WLAN) are well-known, mitigating data integrity concerns.",
    167         "source": "haiku"
    168       },
    169       "contamination": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "Not evaluating LLM capabilities on pre-training benchmarks; evaluating domain-specific task execution. No benchmark contamination risk.",
    173         "source": "haiku"
    174       },
    175       "human_studies": {
    176         "applies": false,
    177         "answer": false,
    178         "justification": "No human participants, so all human_studies questions are N/A.",
    179         "source": "haiku"
    180       },
    181       "cost_and_practicality": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No API cost, latency, or total compute budget reported for running A2H-MAS pipeline. Practical deployment cost unknown.",
    185         "source": "haiku"
    186       }
    187     }
    188   },
    189   "claims": [
    190     {
    191       "claim": "A2H-MAS produces functionally correct and hardware-efficient HLS code from MATLAB algorithms",
    192       "evidence": "Table I shows synthesis results (LUT, DSP, BRAM, frequency) for 5G NR and WLAN implementations; each phase includes functional verification pass/fail.",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "Algorithmic transformation (Adaptation phase) has order-of-magnitude larger impact on resource efficiency than pragma-level optimization (Refinement phase)",
    197       "evidence": "Table II: calcThreshold LUT reduction 36,500→685 (53×) via Adaptation, then 685→173 (4×) via Refinement; extractSSBsig 4,468→275 (16×) via Adaptation, then 275→155 (1.8×) via Refinement.",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "Multi-agent architecture with standardized interfaces reduces hallucinations and improves reliability compared to single-agent LLM translation",
    202       "evidence": "Implicit in design (Fig. 1 contrasts single agent with proposed system); Table II Direct method fails timing closure while proposed methods succeed, but no direct hallucination/error-rate comparison provided.",
    203       "supported": "moderate"
    204     },
    205     {
    206       "claim": "Modular dataflow decomposition enables scalable, parallel execution of algorithm-to-hardware translation",
    207       "evidence": "Section III–IV describes decomposition strategy and phase dependencies, but no empirical data on scalability, parallelism speedup, or failure modes with large algorithms.",
    208       "supported": "weak"
    209     },
    210     {
    211       "claim": "Standardized agent input–output interfaces minimize coupling and enable seamless pipeline integration",
    212       "evidence": "Figure 2 shows interface specification (module_name, function_signature, framework_integration), but no measurement of coupling or empirical comparison against non-standardized alternative.",
    213       "supported": "moderate"
    214     }
    215   ],
    216   "methodology_tags": [
    217     "case-study",
    218     "benchmark-eval"
    219   ],
    220   "key_findings": "A2H-MAS, a modular multi-agent system, automates end-to-end conversion of MATLAB algorithms to FPGA-ready HLS code. Algorithm-level optimization (dataflow restructuring, streaming patterns) yields 10–50× resource reductions, far larger than pragma-level tuning. Systematic dataflow decomposition, deterministic tool validation (MATLAB batch execution, HLS C-sim, RTL co-simulation), and explicit workflow phases reduce LLM hallucinations. On two wireless communication benchmarks (5G NR SSB detection, WLAN synchronization), the system achieves functional correctness, meets timing constraints (292–337 MHz), and produces efficient hardware with moderate resource footprint.",
    221   "red_flags": [
    222     {
    223       "flag": "No comparison with published methods",
    224       "detail": "Only compares against a naive Direct baseline, not against published competing systems (HLSPilot, VeriMind, HDLAgent) mentioned in related work. Claims of relative effectiveness cannot be verified."
    225     },
    226     {
    227       "flag": "Severely limited evaluation scope",
    228       "detail": "Only 2 end-to-end applications tested; 2 modules ablated. No justification for sample size. Generalization claims ('complex hardware development workflows') unsupported by evidence."
    229     },
    230     {
    231       "flag": "Single-run results with no variance",
    232       "detail": "Tables I–II report single-run measurements with no error bars, confidence intervals, or multiple independent runs. Reliability of results unknown."
    233     },
    234     {
    235       "flag": "LLM hallucination claims are qualitative, not quantitative",
    236       "detail": "Paper claims system reduces hallucinations and improves reliability, but provides no error-rate metrics, direct LLM-vs-system comparison, or quantitative reliability measure."
    237     },
    238     {
    239       "flag": "No reproducibility; proprietary system dependency",
    240       "detail": "Code and data not released. System relies entirely on proprietary Claude API (model version unspecified). Readers cannot independently reproduce or verify results."
    241     },
    242     {
    243       "flag": "Minimal failure mode analysis",
    244       "detail": "Beyond one timing closure failure (Direct strategy), no discussion of when/why system fails. No analysis of challenging algorithm types or edge cases."
    245     },
    246     {
    247       "flag": "Model version and hyperparameters unspecified",
    248       "detail": "Paper mentions 'Claude Code' without identifying model variant (Sonnet vs Opus). No temperature, top-p, or LLM inference parameters reported. Example prompts shown but not actual experimental prompts."
    249     },
    250     {
    251       "flag": "No statistical significance testing despite comparative claims",
    252       "detail": "Ablation study (Table II) shows improvements (Direct→Adaptation→Refinement) but no t-tests, confidence intervals, or significance thresholds reported."
    253     }
    254   ],
    255   "cited_papers": [
    256     {
    257       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    258       "relevance": "Benchmark and evaluation protocol for LLM-based HDL generation; establishes that SOTA models (Claude, ChatGPT, Gemini) outperform fine-tuned smaller models."
    259     },
    260     {
    261       "title": "HLSPilot: LLM-Based High-Level Synthesis",
    262       "relevance": "Directly competing agent-based framework for MATLAB/C to HLS translation; represents state-of-the-art in the problem domain."
    263     },
    264     {
    265       "title": "VeriMind: Agentic LLM for Automated Verilog Generation with a Novel Evaluation Metric",
    266       "relevance": "Multi-agent framework for hardware design with verification integration; parallel approach to distributing verification tasks among specialized agents."
    267     },
    268     {
    269       "title": "ChatDev: Communicative Agents for Software Development",
    270       "relevance": "Foundational LLM-based multi-agent system architecture demonstrating role allocation, communication, and collaborative code generation."
    271     },
    272     {
    273       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    274       "relevance": "Multi-agent coordination framework with structured workflows; abstract reasoning about task decomposition and agent specialization."
    275     },
    276     {
    277       "title": "AutoChip: Automating HDL Generation Using LLM Feedback",
    278       "relevance": "Iterative refinement loop for LLM-generated HDL; demonstrates integration of compiler feedback for hardware design."
    279     },
    280     {
    281       "title": "MG-Verilog: Multi-Grained Dataset towards Enhanced LLM-Assisted Verilog Generation",
    282       "relevance": "Domain-specific benchmark and curated dataset for HDL generation; represents data-centric approach to improving LLM performance on hardware tasks."
    283     }
    284   ],
    285   "engagement_factors": {
    286     "practical_relevance": {
    287       "score": 2,
    288       "justification": "Potentially useful for hardware engineers automating MATLAB→FPGA workflows, but severely limited by closed-source implementation and lack of reproducible artifacts."
    289     },
    290     "surprise_contrarian": {
    291       "score": 1,
    292       "justification": "Finding that algorithm-level optimization outweighs pragma tuning is intuitive for hardware designers; not surprising, though useful confirmation."
    293     },
    294     "fear_safety": {
    295       "score": 0,
    296       "justification": "No AI safety or robustness concerns raised; focus is on engineering automation."
    297     },
    298     "drama_conflict": {
    299       "score": 0,
    300       "justification": "No controversy or conflict narrative; technical contribution only."
    301     },
    302     "demo_ability": {
    303       "score": 0,
    304       "justification": "System cannot be tried without access to proprietary implementation and Claude API. No demo code or live interface provided."
    305     },
    306     "brand_recognition": {
    307       "score": 1,
    308       "justification": "Uses Claude (Anthropic), but authors are from UTS and Xidian University (not flagship institutions). Limited pre-existing audience recognition."
    309     }
    310   },
    311   "hn_data": {
    312     "threads": [
    313       {
    314         "hn_id": "29279146",
    315         "title": "Crypto Wash Trading",
    316         "points": 572,
    317         "comments": 299,
    318         "url": "https://news.ycombinator.com/item?id=29279146",
    319         "created_at": "2021-11-19T16:44:26Z"
    320       },
    321       {
    322         "hn_id": "44271284",
    323         "title": "Self-Adapting Language Models",
    324         "points": 246,
    325         "comments": 73,
    326         "url": "https://news.ycombinator.com/item?id=44271284",
    327         "created_at": "2025-06-13T19:03:42Z"
    328       },
    329       {
    330         "hn_id": "41306555",
    331         "title": "Exploring Impact of Code in Pre-Training",
    332         "points": 5,
    333         "comments": 2,
    334         "url": "https://news.ycombinator.com/item?id=41306555",
    335         "created_at": "2024-08-21T03:38:33Z"
    336       },
    337       {
    338         "hn_id": "44443760",
    339         "title": "Your Language Model Can Handle Non-Canonical Tokenizations",
    340         "points": 2,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=44443760",
    343         "created_at": "2025-07-02T13:53:44Z"
    344       },
    345       {
    346         "hn_id": "41745068",
    347         "title": "Pre-training with code improves performance on NL reasoning",
    348         "points": 2,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=41745068",
    351         "created_at": "2024-10-04T20:02:19Z"
    352       },
    353       {
    354         "hn_id": "44116793",
    355         "title": "When Models Don't Collapse: On the Consistency of Iterative MLE",
    356         "points": 1,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=44116793",
    359         "created_at": "2025-05-28T15:06:51Z"
    360       },
    361       {
    362         "hn_id": "43503479",
    363         "title": "The Quantum Technology Job Market: A Quantitative Investigation",
    364         "points": 1,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=43503479",
    367         "created_at": "2025-03-28T10:05:27Z"
    368       },
    369       {
    370         "hn_id": "42884637",
    371         "title": "Player Performance and Skill Rating in Esports [pdf]",
    372         "points": 1,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=42884637",
    375         "created_at": "2025-01-31T04:14:07Z"
    376       },
    377       {
    378         "hn_id": "41367147",
    379         "title": "Kotlin's Type System Is (Also) Unsound",
    380         "points": 1,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=41367147",
    383         "created_at": "2024-08-27T13:11:45Z"
    384       },
    385       {
    386         "hn_id": "41318909",
    387         "title": "To Code, or Not to Code? Exploring Impact of Code in Pre-Training",
    388         "points": 1,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=41318909",
    391         "created_at": "2024-08-22T11:09:37Z"
    392       }
    393     ],
    394     "top_points": 572,
    395     "total_points": 832,
    396     "total_comments": 374
    397   }
    398 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs