scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28085B)
      1 {
      2   "paper": {
      3     "title": "Toward Hardware Security Benchmarking of LLMs",
      4     "authors": [
      5       "Raheel Afsharmazayejani",
      6       "Mohammad Moradi Shahmiri",
      7       "Parker Link",
      8       "Hammond Pearce",
      9       "Benjamin Tan"
     10     ],
     11     "year": 2024,
     12     "venue": "IEEE LLM Aided Design Workshop (LAD)",
     13     "doi": "10.1109/LAD62341.2024.10691745"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "This paper proposes a security-focused evaluation suite for LLM-generated HDL code, with preliminary tests on GPT-3.5 and CodeLlama across 7 benchmark scenarios. GPT-3.5 usually produces syntactically correct code (>80% syntax pass rate on most benchmarks) but security-correct code at much lower rates, with increased prompt verbosity generally helping and increased code redaction hurting Formal Property Verification (FPV) pass rates. CodeLlama performed substantially worse, with most responses yielding no compilable code (~43% compilation failure, ~40% non-code responses). The work highlights a critical gap in security-centric evaluation of LLM-generated hardware designs.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The benchmark suite is released at https://github.com/CalgaryISH/llm-hwsec-benchmarking, explicitly stated in Section I: 'we make our benchmark suite available at [URL] for issues, discussions, and pull requests.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The benchmark suite including test scenarios, prompts, and security assertions is released at the GitHub repository. The benchmark designs sourced from TrustHub are publicly available."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, dependency lists, or software version requirements are provided. They mention using Cadence JasperGold for FPV but provide no setup details."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub release is mentioned for contributions but no README or reproduction guide is described."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as raw counts out of 5 runs (Tables IV-X) and percentages (Figs. 3-5) with no confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper compares GPT-3.5 vs CodeLlama and different verbosity levels without any statistical tests. Claims like 'CodeLlama's overall performance is worse than GPT3.5' rest on raw number comparisons."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Only raw pass counts out of 5 and percentages are reported. No effect sizes, relative improvements, or contextualized magnitude measures are provided."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Each test is run 5 times (375 experiments per model), but no justification is given for why 5 runs were chosen. The paper includes no power analysis or discussion of whether 5 runs are sufficient to detect meaningful differences."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Results show individual pass counts out of 5 in tables but no variance, standard deviation, or spread measures across runs are reported."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Two LLMs are compared (GPT-3.5 and CodeLlama), and Table I compares their evaluation suite against prior work. The models serve as mutual baselines."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "GPT-3.5 was already superseded by GPT-4 when this paper was written (2024). CodeLlama is based on Llama2, not Llama3. Neither model represents the state of the art for the evaluation year."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "While verbosity levels and code redaction lengths are varied (Figs. 3-5), these are parameter sweeps of the benchmark, not systematic ablations removing individual components of a proposed system."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Two distinct metrics are used: syntax check pass rate (Fig. 3) and Formal Property Verification (FPV) pass rate (Fig. 4). Response type classification is also reported for CodeLlama."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section V-A states: 'we assessed functional correctness using plausibility as a proxy; a human with domain knowledge reads the code and judges whether it would plausibly provide the intended function.' Human evaluation of LLM outputs was performed, though acknowledged as limited."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No separation of dev and test sets is described. All benchmark scenarios are used for the evaluation without any held-out partition."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Detailed per-benchmark breakdowns are provided: individual results for all 7 scenarios (REM021-RMI041) in Figs. 3-5 and Tables IV-X, broken down by verbosity level and code block size."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "CodeLlama's compilation failures are extensively analyzed (~43% compilation failure, ~40% non-code, ~3% incomplete). GPT-3.5's declining FPV with increased redaction is discussed. Section V discusses multiple failure modes."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "CodeLlama's near-complete inability to produce compilable HDL is a significant negative result. Low-verbosity prompts never achieving highest FPV pass rates is reported. Medium verbosity sometimes performing worst (REM041) is noted."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims are modest: 'proposes an approach for evaluating the security,' 'new insights into the challenges,' and 'illustrative preliminary use.' All are supported by the paper's content in Sections III-V."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper's causal claims are modest and based on controlled manipulation. 'Increased redaction results in decreased FPV' (Section IV) comes from systematically varying a single parameter (code block size) while holding others constant, which is adequate for the limited claim."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper consistently uses hedging language: 'preliminary,' 'illustrative,' 'initial foray,' 'initial set of test scenarios.' The title itself says 'Toward.' Claims are bounded to the two tested LLMs and the specific benchmark subset."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section V discusses data contamination as an alternative explanation, notes that passing assertions doesn't prove functional correctness, and acknowledges that prompt wording could explain results (Section V-B 'Diverse prompting')."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section V-B 'Assertions' explicitly acknowledges the proxy gap: 'passing an assertion does not always prove that the designer's original intentions have been honoured' and discusses how assertions target individual vulnerabilities without guaranteeing overall security."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper says 'GPT3.5' and 'CodeLlama' without specific version identifiers. No snapshot date, API version, or model size is specified for either model. CodeLlama is described only as 'a fine-tuned version of Llama2.'"
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Actual prompt text is provided for all three verbosity levels in Figs. 2 and 6. A code snippet example is shown in Fig. 7. The benchmark suite with full prompts is released on GitHub."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "For GPT-3.5: 'the default temperature for the web interface was used' without stating the actual value. For CodeLlama: 'temperature of 0.8.' Other parameters (top-p, max tokens) are not mentioned for either model."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. LLMs are prompted directly with code completion tasks."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section III describes how benchmark scenarios were constructed: designs sourced from TrustHub and CWEs, code blocks designated for removal at varying lengths, three verbosity levels created per scenario. The process from design selection to prompt construction is documented."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section V-B is titled 'Limitations and Opportunities' with substantial discussion covering limited sample size, automation bottlenecks, diverse prompting, test data contamination, assertion limitations, and security evaluation scope."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats are discussed: data contamination from publicly available TrustHub circuits, limited sample size constraining granularity, manual post-processing bottleneck, and that assertion passing doesn't guarantee functional correctness. These are specific to this study."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Clear scope boundaries: only two LLMs evaluated, only TrustHub-sourced benchmarks used in preliminary results, evaluation limited to FPV-based security assessment, functional correctness only manually checked. Section V explicitly lists what remains as open challenges."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The benchmark suite is released but the actual raw LLM responses from the 750 experiments (375 per model) are not made available. Only aggregated pass counts (Tables IV-X) are reported."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section IV describes the collection process: each scenario queried at 3 verbosity levels across multiple code block variants, each run 5 times, using GPT-3.5's web interface and CodeLlama at temperature 0.8. FPV performed using Cadence JasperGold."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Benchmark scenarios were author-constructed from existing security benchmarks (TrustHub, CWEs), not sampled from a population."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The pipeline steps are described (prompt → LLM response → syntax check → FPV) but exact counts at each filtering stage are not consistently reported. For GPT-3.5, it is unclear how many designs passed syntax check before FPV evaluation. CodeLlama's filtering is better documented (~43% compilation failure, ~40% non-code, ~3% incomplete, ~14% compiled)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Funding is disclosed: 'supported in part by the Natural Sciences and Engineering Research Council of Canada (NSERC) [RGPIN-2022-03027]' and 'partly supported by a gift from Intel Corporation.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations clearly listed: University of Calgary (Canada) and University of New South Wales (Australia). Authors are not affiliated with OpenAI or Meta (makers of the evaluated LLMs)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "NSERC is a government funding agency with no stake in LLM performance. Intel's gift is disclosed with the note 'This work does not in any way constitute an Intel endorsement of a product/supplier.' Neither funder has a direct interest in the outcome of GPT-3.5 or CodeLlama evaluations."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper. The Intel gift is disclosed but there is no explicit declaration of whether authors hold patents or equity related to the findings."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for either GPT-3.5 or CodeLlama. Without cutoff dates, it is impossible to assess whether TrustHub benchmarks were in the training data."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section II point 3 explicitly states 'LLMs are trained on solutions: Problem sets like HDLBits are freely available; thus will have been used in the training of many LLMs evaluated.' Section V-B also discusses contamination risk for their benchmark."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Contamination is discussed as a specific limitation in Section V-B: 'it is safe to assume that any representative, real-world benchmark is at risk for cross-contamination between training and test data, including the current work.' Also flagged as a desideratum in Section II."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
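    252         "justification": "No human participants in this study. The evaluation pipeline is tool-based (LLM prompting, syntax checking, FPV); the only human involvement described is manual post-processing and a domain expert's plausibility reading of the generated code, which does not constitute a human-subjects study."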
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study evaluates LLM-generated code against automated security assertions."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, API cost, or per-experiment cost is reported for either GPT-3.5 or CodeLlama despite running 750 total experiments."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget is stated. No mention of GPU hours for CodeLlama, API costs for GPT-3.5, or JasperGold FPV compute time."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Each test is run 5 times but results are reported as raw pass counts, not as mean/std across runs. No explicit seed sensitivity analysis is conducted."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Appendix C (Table III) clearly states: 'Each scenario is repeated five times, adding up to 375 experiments run on each evaluated model.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search was conducted. Temperature was set to default for GPT-3.5 and 0.8 for CodeLlama without justification or exploration of other values."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The choice of temperature 0.8 for CodeLlama is not justified. GPT-3.5's 'default temperature' is used without explanation. No validation set or selection criterion is described for these choices."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors designed the benchmark, crafted the assertions, and evaluated the LLMs without acknowledging potential bias in their assertion design or evaluation process."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "GPT-3.5 (commercial hosted model, queried through its web interface) and CodeLlama (open-source, locally run) have very different compute profiles, but no discussion of compute budget differences is provided."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Section V-B 'Assertions' discusses construct validity: 'passing an assertion does not always prove that the designer's original intentions have been honoured' and notes that security assertions target individual vulnerabilities without guaranteeing overall design security."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. LLMs are prompted directly without any agentic framework or tooling."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "While contamination is discussed generally, no specific temporal analysis is performed. The paper does not state when TrustHub benchmarks were published relative to GPT-3.5 or CodeLlama training cutoffs."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The medium and high verbosity prompts explicitly describe the vulnerability to be fixed, effectively providing the answer. This information leakage is designed as an experimental variable but not discussed as a potential confound for security evaluation claims."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the benchmark scenarios (sourced from TrustHub and CWEs) share structural similarities with each other or with LLM training data examples."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection or prevention method is applied. Contamination is only discussed as a conceptual concern without any detection analysis."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "GPT-3.5 produces syntactically correct HDL code in most cases, with >80% syntax check pass rate for most benchmarks",
    370       "evidence": "Fig. 3 shows syntax check rates across 7 benchmark scenarios at 3 verbosity levels, with most exceeding 80% (Section IV)",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Increased code redaction generally leads to decreased FPV (security) pass rates for GPT-3.5",
    375       "evidence": "Fig. 5 shows FPV pass rates by number of removed lines. 'Except for REM021 and RMI041, increased redaction results in decreased FPV' (Section IV)",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Low-verbosity prompts never achieve the highest FPV pass rate across any benchmark",
    380       "evidence": "Section IV: 'In none of the benchmarks a low-verbosity prompt provides the highest FPV pass rate.' Supported by Fig. 4 data across all 7 benchmarks.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "CodeLlama's overall performance is substantially worse than GPT-3.5 on the evaluation suite",
    385       "evidence": "Section IV: Out of 375 CodeLlama experiments, ~43% ended in compilation failure, ~40% were non-code responses, ~3% were incomplete, and ~14% compiled. Nearly all of the compiled samples were still not functionally correct.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Security-centric evaluation of LLM-generated HDL code is a significant gap in the literature",
    390       "evidence": "Table I shows only 4 prior works with security-related evaluations, most with small test sets. Section II catalogs 5 specific weaknesses in existing evaluations.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Very small preliminary evaluation",
    397       "detail": "Only 2 LLMs evaluated on 7 benchmark scenarios with 5 runs each. One of the two models (CodeLlama) produced almost no usable results, leaving effectively a single-model evaluation."
    398     },
    399     {
    400       "flag": "No statistical analysis",
    401       "detail": "All comparisons are based on raw counts out of 5 without any statistical tests, confidence intervals, or variance reporting. With n=5 per condition, differences between conditions may be entirely due to sampling noise."
    402     },
    403     {
    404       "flag": "Outdated model selection",
    405       "detail": "GPT-3.5 was already superseded by GPT-4 at publication time (2024). CodeLlama is based on Llama2 rather than more recent models. Results may not reflect current LLM capabilities."
    406     },
    407     {
    408       "flag": "Functional correctness not rigorously verified",
    409       "detail": "Functional correctness was assessed by human 'plausibility' judgment, which the authors themselves acknowledge is subjective and unscalable. Security assertions can pass on code that is not functionally correct."
    410     },
    411     {
    412       "flag": "Assertion quality varies by design",
    413       "detail": "The authors acknowledge 'the availability and quality of the assertions in our work vary considerably, making it difficult to do a balanced comparison' (Section V-A). This undermines cross-benchmark comparisons."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    419       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    420       "year": 2022,
    421       "relevance": "Foundational study on security vulnerabilities in LLM-generated code, finding 40% of generated code contains vulnerabilities."
    422     },
    423     {
    424       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    425       "authors": ["S. Thakur", "B. Ahmad", "H. Pearce", "B. Tan", "B. Dolan-Gavitt", "R. Karri", "S. Garg"],
    426       "year": 2023,
    427       "arxiv_id": "2308.00708",
    428       "relevance": "LLM benchmark for Verilog code generation using HDLBits as foundation; demonstrates LLM capability and limitations for HDL."
    429     },
    430     {
    431       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    432       "authors": ["M. Liu", "N. Pinckney", "B. Khailany", "H. Ren"],
    433       "year": 2023,
    434       "arxiv_id": "2309.07544",
    435       "relevance": "Benchmark evaluation of LLMs for Verilog code generation with 156 test scenarios."
    436     },
    437     {
    438       "title": "(Security) Assertions by Large Language Models",
    439       "authors": ["R. Kande", "H. Pearce", "B. Tan", "B. Dolan-Gavitt", "S. Thakur", "R. Karri", "J. Rajendran"],
    440       "year": 2024,
    441       "relevance": "Evaluates LLM capability in generating hardware security assertions, directly related to security evaluation of LLM-generated designs."
    442     },
    443     {
    444       "title": "On Hardware Security Bug Code Fixes By Prompting Large Language Models",
    445       "authors": ["B. Ahmad", "S. Thakur", "B. Tan", "R. Karri", "H. Pearce"],
    446       "year": 2024,
    447       "relevance": "Explores LLM prompting for fixing hardware security bugs, related to LLM capability in security-critical code generation."
    448     },
    449     {
    450       "title": "AutoChip: Automating HDL Generation Using LLM Feedback",
    451       "authors": ["S. Thakur", "J. Blocklove", "H. Pearce", "B. Tan", "S. Garg", "R. Karri"],
    452       "year": 2023,
    453       "arxiv_id": "2311.04887",
    454       "relevance": "LLM-based automated HDL generation with feedback loops, demonstrating agentic approaches to hardware design."
    455     },
    456     {
    457       "title": "Purple Llama CyberSecEval: A Secure Coding Benchmark for Language Models",
    458       "authors": ["M. Bhatt", "S. Chennabasappa", "C. Nikolaidis"],
    459       "year": 2023,
    460       "arxiv_id": "2312.04724",
    461       "relevance": "Security-focused benchmark for LLM-generated software code using automated insecure code detectors."
    462     },
    463     {
    464       "title": "All Artificial, Less Intelligence: GenAI through the Lens of Formal Verification",
    465       "authors": ["D. N. Gadde", "A. Kumar", "T. Nalapat", "E. Rezunov", "F. Cappellini"],
    466       "year": 2024,
    467       "arxiv_id": "2403.16750",
    468       "relevance": "Evaluates GenAI-generated hardware through formal verification with 30 security-related tests."
    469     },
    470     {
    471       "title": "Code Llama: Open Foundation Models for Code",
    472       "authors": ["B. Rozière", "J. Gehring", "F. Gloeckle"],
    473       "year": 2024,
    474       "arxiv_id": "2308.12950",
    475       "relevance": "Open-source code generation model evaluated in this study; key baseline for code LLM capability."
    476     },
    477     {
    478       "title": "AssertLLM: Generating and Evaluating Hardware Verification Assertions from Design Specifications via Multi-LLMs",
    479       "authors": ["W. Fang", "M. Li", "M. Li", "Z. Yan", "S. Liu", "H. Zhang", "Z. Xie"],
    480       "year": 2024,
    481       "arxiv_id": "2402.00386",
    482       "relevance": "Multi-LLM approach to generating hardware verification assertions from design specifications."
    483     },
    484     {
    485       "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models",
    486       "authors": ["S. Golchin", "M. Surdeanu"],
    487       "year": 2023,
    488       "relevance": "Methods for detecting data contamination in LLMs, directly relevant to benchmark validity concerns."
    489     }
    490   ]
    491 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs