scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22105B)
      1 {
      2   "paper": {
      3     "title": "ThreatLens: LLM-guided Threat Modeling and Test Plan Generation for Hardware Security Verification",
      4     "authors": [
      5       "Dipayan Saha",
      6       "Hasan Al Shaikh",
      7       "Shams Tarek",
      8       "Farimah Farahmandi"
      9     ],
     10     "year": 2025,
     11     "venue": "IEEE VLSI Test Symposium (VTS) 2025",
     12     "arxiv_id": "2505.06821",
     13     "doi": "10.48550/arXiv.2505.06821"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [],
     17   "methodology_tags": ["case-study"],
     18   "key_findings": "ThreatLens is a multi-agent LLM framework using GPT-4o and RAG for automated hardware security threat modeling and test plan generation. Evaluated on the NEORV32 RISC-V SoC, it automatically generated 854 unique security policies from ISA and specification documents. Two case studies show generated policies correspond to known real-world vulnerabilities (PMP access control and CSR access exceptions), but no systematic quantitative evaluation is provided.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL or code archive is provided anywhere in the paper. The framework is described but not released."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The security knowledge base of academic papers used for RAG is not released. The generated 854 security policies are not made available. The NEORV32 SoC is public but the ThreatLens-specific data is not."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions GPT-4o, LangChain, OpenAI embeddings, and FAISS but provides no version numbers, requirements file, or environment setup details sufficient to recreate the system."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No reproduction instructions are provided. The framework architecture is described at a high level but there are no step-by-step instructions to replicate the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No quantitative results with confidence intervals or error bars are reported. The paper presents only case studies and a count of generated policies (854)."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are used. The evaluation consists of two qualitative case studies with no comparative statistical analysis."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No effect sizes are reported. The paper claims the framework 'reduces the manual verification effort' but provides no quantitative comparison to establish magnitude of improvement."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The framework is tested on a single SoC (NEORV32) with no justification for why this one platform is sufficient to validate the approach."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance or repeated runs are reported. Results appear to be from a single execution of the framework."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines are included. There is no comparison against manual threat modeling, alternative automated tools, or simpler LLM-based approaches."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No baselines of any kind are included, so contemporaneity cannot be assessed."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The system has multiple components (RAG retrieval, multi-agent architecture, iterative user feedback, security policy extraction) but no ablation study tests the contribution of individual components."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No quantitative metrics are reported at all. The evaluation relies entirely on two qualitative case studies."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No systematic human evaluation of the generated policies or test plans is performed. The two case studies show individual policies are reasonable but there is no structured expert assessment of the 854 generated policies."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No held-out test set is used. The evaluation is based on two hand-picked case studies from the NEORV32 results."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No breakdown of results across threat categories, policy types, or other dimensions is provided. The 854 policies are reported as a single aggregate number."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No failure cases are discussed. Only two successful case studies are presented; no examples of incorrect, irrelevant, or missed policies are shown."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No negative results are reported. Every aspect of the evaluation presents the framework positively."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract describes the framework as 'demonstrating its capability to automate security verification through structured test plans and validating its effectiveness in real-world scenarios.' The paper shows only two case studies on one SoC — this does not constitute validation of effectiveness. The claim of 'reducing manual verification effort' is unsupported by any quantitative comparison."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims the framework 'reduces the manual verification effort, enhances coverage, and ensures a structured, adaptable approach.' These are causal claims ('reduces', 'enhances') with no experimental design to establish causation — no before/after comparison, no control condition."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper's title refers broadly to 'Hardware Security Verification', but the framework is tested only on a single RISC-V SoC (NEORV32). The conclusion acknowledges the need for 'enhancing adaptability to diverse hardware architectures', but the title and abstract do not bound claims to RISC-V or this specific SoC."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations are discussed. The paper does not consider whether simpler approaches (e.g., rule-based policy extraction, keyword matching) could achieve similar results, or whether the RAG and multi-agent architecture are necessary."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures policy generation count (854) and shows two policies match known vulnerabilities, but frames this as 'validating effectiveness' and 'ensuring robust security verification.' The gap between generating policies and actually improving security verification is not acknowledged."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper states 'we used GPT-4o as the LLM' (Section III) but provides no specific version, snapshot date, or API version."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Prompts are described in natural language (e.g., 'a structured LLM prompt along with the security knowledge,' 'a prompt with specific instructions') but the actual prompt text is never provided."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or other LLM API settings are mentioned."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The multi-agent framework is described in detail in Section II with workflow diagrams (Figures 1-4), four specialized agents with defined roles, RAG-based retrieval using LangChain/FAISS, iterative user feedback loops, and two distinct processing flows."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The RAG system's document chunking, embedding, and retrieval parameters are not specified. How the security knowledge base was curated, how specification documents were processed, and how duplicate policies were identified and removed are not documented."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated limitations section. The conclusion contains two sentences about limitations ('does not incorporate security asset extraction' and reliance on 'GPT-4o... closed-source model with high computational costs') but this is not substantive discussion."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed. The brief limitations in the conclusion are about future work directions, not about specific threats to the validity of the current results."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit scope boundaries are stated. The paper does not clarify what types of hardware, what threat classes, or what design scales the framework does NOT address."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw data is available. The 854 generated policies, the RAG knowledge base, the user interaction logs, and the full test plans are not released."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The security knowledge base is described as 'a knowledge base of academic papers that document state-of-the-art attack models' but no details about which papers, how many, or how they were selected are provided."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "The paper involves no human participants in the research sense. The verification engineer interacts with the tool as a user in a demo, not as a study participant. The evaluation platform (NEORV32) is a standard public SoC."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The high-level pipeline (specification → RAG extraction → LLM analysis → policies) is described but specific filtering criteria, intermediate counts at each stage, and quality control steps are not documented. Only the final count of 854 policies is given."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Funding is disclosed: 'We thank to U.S. National Science Foundation (NSF) for their support through CAREER Award under Grant 2339971.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors are affiliated with the Department of Electrical and Computer Engineering, University of Florida, clearly stated in the paper header."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "NSF is an independent government funding agency with no financial stake in the outcome of this research."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper does not evaluate GPT-4o's capability on a benchmark. It uses GPT-4o as a component in a tool for policy extraction and threat identification, not to test model knowledge."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the paper tests a tool's outputs (generated policies and test plans), not a pre-trained model's performance on a benchmark."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Not applicable — no benchmark evaluation of model capabilities is performed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in a research study. The verification engineer role is part of the tool's interactive workflow, not a studied population."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects research conducted."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in a research study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in a research study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in a research study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in a research study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in a research study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, API costs, or latency information is reported despite using GPT-4o API calls through the multi-agent framework."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No compute budget, API spend, or hardware requirements are stated."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "ThreatLens automatically generated 854 unique security policies for the NEORV32 SoC from ISA and specification documents.",
    301       "evidence": "Section III states: 'the framework automatically generated 854 unique security policies, a task that would require significant time and effort if performed manually.'",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Generated policies correspond to known real-world hardware vulnerabilities (PMP access control violations).",
    306       "evidence": "Case Study 1 (Section III) shows a generated PMP policy matches a vulnerability studied at HACK@DAC [14] and reported in a real RISC-V SoC [15] where DMA bypassed PMP protections.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Generated policies identify CSR access control vulnerabilities found in real open-source RISC-V processors.",
    311       "evidence": "Case Study 2 (Section III) shows a generated CSR policy matches vulnerabilities reported in [16] and [17] in open-source RISC-V processors.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "ThreatLens reduces manual verification effort and enhances coverage.",
    316       "evidence": "Abstract and Section I claim this, but no quantitative comparison between manual and automated effort is provided. No coverage metric is defined or measured.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "No quantitative evaluation",
    323       "detail": "The entire evaluation consists of two hand-picked case studies from 854 generated policies. No metrics for precision, recall, coverage, or quality of the generated policies or test plans are provided."
    324     },
    325     {
    326       "flag": "No baselines or comparisons",
    327       "detail": "No comparison against manual threat modeling effort, alternative tools, simpler LLM approaches, or even a random policy generator. Impossible to assess whether the multi-agent RAG architecture adds value."
    328     },
    329     {
    330       "flag": "Claims outrun evidence",
    331       "detail": "The paper claims 'validating its effectiveness in real-world scenarios' and 'reduces manual verification effort' based on two case studies on one SoC with no quantitative evidence of time savings, coverage improvement, or quality."
    332     },
    333     {
    334       "flag": "Cherry-picked case studies",
    335       "detail": "Only 2 of 854 generated policies are examined. These are policies that happen to match known vulnerabilities — there is no assessment of how many policies are incorrect, redundant, or trivial."
    336     },
    337     {
    338       "flag": "Sample size of one",
    339       "detail": "The framework is evaluated on a single SoC (NEORV32). No evidence that it works on other architectures, designs of different complexity, or non-RISC-V platforms."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "LLM for SoC Security: A Paradigm Shift",
    345       "authors": ["Dipayan Saha", "Shams Tarek", "Katayoon Yahyaei", "Sujan Kumar Saha", "Jingbo Zhou", "Mark Tehranipoor", "Farimah Farahmandi"],
    346       "year": 2024,
    347       "relevance": "Directly examines LLM applications in hardware SoC security verification, covering vulnerability identification from design specifications."
    348     },
    349     {
    350       "title": "Empowering Hardware Security with LLM: The Development of a Vulnerable Hardware Database",
    351       "authors": ["Dipayan Saha", "Katayoon Yahyaei", "Sujan Kumar Saha", "Mark Tehranipoor", "Farimah Farahmandi"],
    352       "year": 2024,
    353       "relevance": "Uses LLMs to build a database of hardware vulnerabilities, relevant to AI-assisted security analysis."
    354     },
    355     {
    356       "title": "Assert-o: Context-based Assertion Optimization Using LLMs",
    357       "authors": ["S. S. Miftah", "A. Srivastava", "H. Kim", "K. Basu"],
    358       "year": 2024,
    359       "relevance": "Applies LLMs to optimize hardware security assertions, directly relevant to LLM-driven verification."
    360     },
    361     {
    362       "title": "NSPG: Natural Language Processing-based Security Property Generator for Hardware Security Assurance",
    363       "authors": ["X. Meng", "A. Srivastava", "A. Arunachalam", "A. Ray", "P. H. Silva", "R. Psiakis", "Y. Makris", "K. Basu"],
    364       "year": 2024,
    365       "relevance": "Uses NLP to generate hardware security properties from specifications, closely related to automated security policy generation."
    366     },
    367     {
    368       "title": "(Security) Assertions by Large Language Models",
    369       "authors": ["R. Kande", "H. Pearce", "B. Tan", "B. Dolan-Gavitt", "S. Thakur", "R. Karri", "J. Rajendran"],
    370       "year": 2024,
    371       "relevance": "Evaluates LLM capability for generating security assertions for hardware, relevant to LLM code generation and security."
    372     },
    373     {
    374       "title": "SoCureLLM: An LLM-driven Approach for Large-Scale System-on-Chip Security Verification and Policy Generation",
    375       "authors": ["Shams Tarek", "Dipayan Saha", "Sujan Kumar Saha", "Mark Tehranipoor", "Farimah Farahmandi"],
    376       "year": 2024,
    377       "relevance": "Closely related LLM-driven approach for SoC security verification and policy generation that ThreatLens builds upon."
    378     },
    379     {
    380       "title": "Exploring Automated Assertion Generation via Large Language Models",
    381       "authors": ["Q. Zhang", "W. Sun", "C. Fang", "B. Yu", "H. Li", "M. Yan", "J. Zhou", "Z. Chen"],
    382       "year": 2025,
    383       "doi": "10.1145/3699598",
    384       "relevance": "Explores LLM-based automated assertion generation targeting CWE vulnerabilities in SystemVerilog."
    385     }
    386   ],
    387   "engagement_factors": {
    388     "practical_relevance": {
    389       "score": 2,
    390       "justification": "Hardware security engineers could adopt this RAG+multi-agent approach for threat modeling, but no code is released so immediate use is not possible."
    391     },
    392     "surprise_contrarian": {
    393       "score": 0,
    394       "justification": "Straightforward application of LLMs and RAG to automate a manual process — confirms expectations about LLM utility rather than challenging them."
    395     },
    396     "fear_safety": {
    397       "score": 1,
    398       "justification": "Relates to hardware security verification but does not demonstrate novel attacks or raise new AI safety concerns."
    399     },
    400     "drama_conflict": {
    401       "score": 0,
    402       "justification": "No controversy or conflict; a standard systems paper proposing a new tool."
    403     },
    404     "demo_ability": {
    405       "score": 0,
    406       "justification": "No code, demo, or tool released; the framework exists only as described in the paper."
    407     },
    408     "brand_recognition": {
    409       "score": 0,
    410       "justification": "From the University of Florida ECE department; not a well-known AI lab or recognizable brand."
    411     }
    412   }
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs