scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18988B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Toward Hardware Security Benchmarking of LLMs",
      6     "authors": [
      7       "Raheel Afsharmazayejani",
      8       "Mohammad Moradi Shahmiri",
      9       "Parker Link",
     10       "Hammond Pearce",
     11       "Benjamin Tan"
     12     ],
     13     "year": 2024,
     14     "venue": "IEEE LLM Aided Design Workshop (LAD)",
     15     "arxiv_id": null,
     16     "doi": "10.1109/LAD62341.2024.10691745"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims (LLM promise, ability to generate HDL, need for security evaluation) are supported by content. Framework is presented and preliminary results shown. Claims appropriately qualified as 'promise' and 'initial work'.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Paper avoids strong causal claims. Observational findings on verbosity/redaction effects are presented as preliminary with appropriate caveats ('needs further investigation'). No unsupported causal mechanisms claimed.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Generalization explicitly bounded: 'initial work', 'preliminary evaluation', limited to 2 LLMs, 7 test scenarios, 375 experiments. Title uses 'Toward' indicating work-in-progress.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Paper acknowledges findings need investigation (e.g., 'context sensitivity' of GPT3.5) but does not explore alternative explanations. Main result (redaction decreases FPV) lacks discussion of confounds.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Paper explicitly distinguishes between proxy (assertion pass rate) and intended outcome (functional correctness): 'passing an assertion does not always prove that the designer's original intentions have been honoured.' Distinction clearly stated in Section V.B.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section V titled 'Discussion' with dedicated V.B 'Limitations and Opportunities' subsection listing specific constraints: limited sample, automation bottlenecks, manual verification, data contamination.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats identified: sample size (2 LLMs, 7 tests), manual post-processing bottleneck, manual functional verification (non-scalable, subjective), test data contamination (publicly available benchmarks likely in training data), assertion-intent mismatch.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Scope explicitly bounded throughout: 'initial foray', 'preliminary evaluation (constrained due to time and space limits)', 'much of our evaluation framework is yet to be fully automated'. No overgeneralization.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "NSERC funding disclosed (RGPIN-2022-03027). Intel gift disclosed with disclaimer ('does not constitute endorsement').",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Author affiliations stated (University of Calgary, UNSW) but no disclosure of whether authors have ties to OpenAI (GPT3.5) or Meta (CodeLlama). Potential conflicts unaddressed.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "NSERC is independent. Intel's independence questionable: disclaimer states 'does not constitute endorsement' but Intel involvement in hardware/AI accelerators creates undisclosed incentive structure. Weak independence claim.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patents, equity, or consulting arrangements declared. Minimal conflict disclosure.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Some terms defined (CWE, FPV, Verbosity) but core domain terms not: HDL, RTL, 'security' (discussed broadly but not precisely defined early). 'Hardware security' is the main contribution but demonstrated rather than defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contribution clearly stated: security-focused evaluation suite for LLM-generated HDL. Explicit list of three contributions (Section I). GitHub repo link provided for artifact access.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II identifies five specific gaps in prior work (lack of standardization, limited problems, training data overlap, academic focus, lack of security focus). Table I compares prior evaluations. Paper positions contribution against this gap.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "Paper states benchmarks inspired by TrustHub/CAD4Security/CWEs but does NOT argue why these specific test cases measure LLM security capability or represent real-world hardware threats. Construct validity not justified.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Paper uses code redaction size (1-15 lines) and verbosity (low/medium/high) as implicit difficulty proxies but provides no formal difficulty tier classification or measurement. Results show variance but no explicit difficulty characterization.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Paper does not systematically discuss ceiling/floor effects. CodeLlama shows clear floor (~0% pass) but no analysis of whether benchmarks discriminate effectively across model range.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline provided. Paper does not report human performance on these tasks, making it impossible to calibrate difficulty or validate that LLM scores are meaningful.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Scoring is binary (assertion pass/fail) but not justified. Paper acknowledges limitation: assertions don't capture functional correctness. No rationale for binary vs. partial-credit scoring.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No contamination resistance designed. Benchmarks sourced from publicly available TrustHub, CAD4Security, and hardware CWEs (likely in LLM training data). Paper acknowledges: 'As we cannot currently control our evaluation to compensate for that, evaluation results might not truly reflect model capabilities.'",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Temporal robustness mentioned ('necessitating evolving evaluation suites') and future leaderboard planned, but no concrete plan for anti-gaming measures or benchmark evolution strategy.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Failure modes identified: assertions don't prove intent (Section V.B), no fully automated security analysis tools, myriad dimensions to security not all addressed, functional verification is manual and subjective.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "GitHub repo provided but paper does not clarify what baseline implementations are included (e.g., scripts to run GPT3.5, parse results). Unclear if benchmark is immediately runnable by others.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Table II provides schema (ID, Plain text, Weakness/Vulnerability classification, Assertions, Tool, Design, Origin, Reference) and Appendix A details, but lacks data collection methodology (how designs selected?), preprocessing steps, and versioning strategy.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "GitHub repo URL provided but no explicit license (MIT, Apache, GPL) stated in paper. Access terms and reuse permissions not specified.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Intended use stated ('compare LLMs on secure HDL generation') but boundaries on conclusions absent. Paper does not specify what should NOT be concluded or which domains are out of scope.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "GPT3.5 generates syntactically correct HDL code >80% of the time",
    203       "evidence": "Figure 3 syntax check rates for GPT3.5 across 7 test scenarios; most tests show >80% pass rate.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Increased code redaction (more LLM-generated code) decreases security (FPV pass rate) for most designs",
    208       "evidence": "Figure 5 shows FPV decline with line count for REM031, REM041, REM042, REM043, REM051; REM021 and RMI041 are exceptions (only 7 test cases).",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "Higher verbosity prompts do not consistently improve security performance",
    213       "evidence": "Figure 4 shows per-design variation: REM021 medium best, REM041 medium worst; no consistent verbosity advantage.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "CodeLlama fails to generate usable HDL code for this benchmark: only ~14% of samples compile (~43% are RTL that fails compilation, ~40% are non-code, ~3% are incomplete)",
    218       "evidence": "Section IV results: 'nearly all of the samples failed to compile'; distribution ~43% RTL code failing compilation, ~40% non-code, ~3% incomplete, ~14% compilable.",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Security assertions alone are insufficient to verify functional correctness of generated code",
    223       "evidence": "Section V.B: 'passing an assertion does not always prove that the designer's original intentions have been honoured'; assertions may pass while code is functionally incorrect.",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Hardware security evaluation of LLMs requires formal property verification with hand-crafted assertions",
    228       "evidence": "Section III testing methodology uses FPV with SystemVerilog assertions. Paper acknowledges 'no fully automated security analysis tools exist' for hardware.",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Publicly available benchmark sources are at risk of contamination in LLM training data",
    233       "evidence": "Section V.A: 'problem sets like HDLBits are freely available; thus will have been used in the training of many LLMs'; acknowledged for TrustHub, CAD4Security benchmarks.",
    234       "supported": "moderate"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "benchmark-creation",
    239     "benchmark-eval",
    240     "observational"
    241   ],
    242   "key_findings": "The paper proposes an initial security-focused benchmark suite for evaluating LLMs on hardware security, addressing a gap in prior work that emphasizes functionality over security. Preliminary evaluation on GPT3.5 and CodeLlama shows that GPT3.5 generates syntactically correct HDL (>80%) but security performance degrades with larger code redactions, while CodeLlama fails to produce compilable code in most cases (only ~14% of samples compile; ~43% are RTL that fails compilation and ~40% are not code at all). The work identifies critical open challenges: lack of standardized hardware security metrics, uncontrolled training data contamination (benchmarks sourced from publicly available databases), and difficulty distinguishing security violations from functional bugs. The benchmark suite is positioned as an in-progress, community-driven resource requiring significant future work on automation and temporal robustness.",
    243   "red_flags": [
    244     {
    245       "flag": "No human baseline",
    246       "detail": "Benchmark lacks human performance reference, making difficulty calibration impossible and benchmark validation uncertain."
    247     },
    248     {
    249       "flag": "Uncontrolled training data contamination",
    250       "detail": "Benchmark sources (TrustHub, CAD4Security, hardware CWEs) are publicly available and likely in LLM training data; results may not reflect true capability."
    251     },
    252     {
    253       "flag": "Manual functional verification bottleneck",
    254       "detail": "Functional correctness verified by subjective human expert judgment; non-scalable, introduces bias, acknowledged as open problem."
    255     },
    256     {
    257       "flag": "Very limited sample size",
    258       "detail": "Only 2 LLMs, 7 test scenarios, 375 total experiments per LLM; too small to support strong claims about LLM security capabilities."
    259     },
    260     {
    261       "flag": "Scoring rubric not justified",
    262       "detail": "Binary pass/fail on assertions acknowledged as insufficient ('does not always prove intent') but no alternative scoring method proposed."
    263     },
    264     {
    265       "flag": "Inconsistent verbosity effects unexplained",
    266       "detail": "Verbosity effects vary by design: medium verbosity is best for REM021 but worst for REM041; no pattern or explanation provided; suggests weak or confounded signal."
    267     },
    268     {
    269       "flag": "Construct validity not established",
    270       "detail": "Why these specific CWE-derived test cases measure LLM security capability is not argued; representativeness to real hardware security risks unestablished."
    271     },
    272     {
    273       "flag": "Weak conflict-of-interest disclosure",
    274       "detail": "Intel funding disclosed with disclaimer but Intel's stakes in AI/hardware accelerators create undisclosed incentive structure; author affiliations with OpenAI/Meta not disclosed."
    275     }
    276   ],
    277   "cited_papers": [
    278     {
    279       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    280       "relevance": "Prior HDL code generation benchmark; directly comparable baseline for functional evaluation."
    281     },
    282     {
    283       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    284       "relevance": "Prior LLM HDL generation work; establishes capability baseline for code-generation tasks."
    285     },
    286     {
    287       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    288       "relevance": "LLM security evaluation in software domain; methodological parallel for applying security metrics to generated code."
    289     },
    290     {
    291       "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model",
    292       "relevance": "Prior RTL generation benchmark; functional-focused predecessor to this security-focused work."
    293     },
    294     {
    295       "title": "(Security) Assertions by Large Language Models",
    296       "relevance": "LLM assertion generation for security verification; methodologically related to this work's use of assertions."
    297     },
    298     {
    299       "title": "AssertLLM: Generating and Evaluating Hardware Verification Assertions from Design Specifications via Multi-LLMs",
    300       "relevance": "LLM-based assertion generation for hardware; directly relevant to security assertion approach."
    301     },
    302     {
    303       "title": "Machine Learning for Electronic Design Automation: A Survey",
    304       "relevance": "Establishes broader context of ML/LLM application to hardware design automation."
    305     },
    306     {
    307       "title": "Chip-Chat: Challenges and Opportunities in Conversational Hardware Design",
    308       "relevance": "Prior work on LLM-aided hardware design; identifies challenges in design automation that this work extends to security."
    309     }
    310   ],
    311   "engagement_factors": {
    312     "practical_relevance": {
    313       "score": 1,
    314       "justification": "Benchmark is preliminary and not yet integrated into real design workflows. CodeLlama results unusable; GPT3.5 results limited to academic test cases, not production design scenarios."
    315     },
    316     "surprise_contrarian": {
    317       "score": 2,
    318       "justification": "Finding that higher verbosity doesn't always improve performance is somewhat counterintuitive; the other main result is as expected: larger redactions (more code for the LLM to generate) increase difficulty and lower performance."
    319     },
    320     "fear_safety": {
    321       "score": 2,
    322       "justification": "Hardware security is critical (prevents side-channel attacks, key leakage) but framed as engineering problem, not AI safety/alignment concern. No AGI risk framing."
    323     },
    324     "demo_ability": {
    325       "score": 1,
    326       "justification": "GitHub repo provided but paper unclear on usability. Requires Cadence JasperGold (commercial, expensive FPV tool); manual assertion writing; not immediately runnable by practitioners."
    327     },
    328     "brand_recognition": {
    329       "score": 1,
    330       "justification": "Authors from University of Calgary and UNSW; no household-name institutions. Venue is IEEE LLM Aided Design Workshop (specialized, not flagship conference)."
    331     },
    332     "drama_conflict": {
    333       "score": 1,
    334       "justification": "Straightforward methodological work with no controversial claims. Intel funding raises weak conflict-of-interest signal but not discussed. No drama or conflict angle."
    335     }
    336   },
    337   "hn_data": {
    338     "threads": [],
    339     "top_points": 0,
    340     "total_points": 0,
    341     "total_comments": 0
    342   }
    343 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs