ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26037B)


      1 {
      2   "paper": {
      3     "title": "Large Language Model for Verilog Code Generation: Literature Review and the Road Ahead",
      4     "authors": [
      5       "Guang Yang",
      6       "Wei Zheng",
      7       "Dong Liang",
      8       "Peng Hu",
      9       "Yukui Yang",
     10       "Shaohang Peng",
     11       "Zhenghan Li",
     12       "Jiahui Feng",
     13       "Xiao Wei",
     14       "Kexin Sun",
     15       "Deyuan Ma",
     16       "Haotian Cheng",
     17       "Yiheng Shen",
     18       "Xiang Chen",
     19       "Xing Hu",
     20       "Terry Yue Zhuo",
     21       "David Lo"
     22     ],
     23     "year": 2025,
     24     "venue": "ACM Computing Surveys",
     25     "arxiv_id": "2512.00020",
     26     "doi": "10.48550/arXiv.2512.00020"
     27   },
     28   "scan_version": 3,
     29   "active_modules": ["survey_methodology"],
     30   "methodology_tags": ["meta-analysis"],
     31   "key_findings": "This systematic literature review analyzes 102 papers (70 peer-reviewed, 32 preprints) on LLM-based Verilog code generation from 2020-2025, documenting exponential growth from 1 paper in 2020 to 66 in 2025. The field has converged on code-specialized foundation models (DeepSeek-Coder, Qwen2.5-Coder, CodeLlama) with 34 instruction-tuned LLMs developed for Verilog, 19 with open weights. The review identifies a shift toward execution-based evaluation (functional-pass@k) as the dominant metric, and categorizes techniques into training-free (EDA feedback, prompt engineering) and training-based (SFT, RL) approaches, while highlighting critical gaps in security, efficiency alignment, copyright protection, and hallucination mitigation.",
     32   "checklist": {
     33     "artifacts": {
     34       "code_released": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No repository URL, analysis code, or supplementary materials are released. The survey could have released its search queries, coding spreadsheets, or analysis scripts but did not."
     38       },
     39       "data_released": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No structured dataset of the 102 reviewed papers, extracted data, or coding results is released. The reference list provides paper titles but not the coded attributes or quality assessment scores."
     43       },
     44       "environment_specified": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No environment specifications are provided. While this is a survey without computational experiments, any analysis scripts or data processing tools used are not specified."
     48       },
     49       "reproduction_instructions": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "While Section 3 describes the search methodology in detail (databases, keywords, filtering stages), there are no step-by-step instructions for reproducing the exact search results or data extraction process."
     53       }
     54     },
     55     "statistical_methodology": {
     56       "confidence_intervals_or_error_bars": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "This is a qualitative systematic literature review with descriptive counts and categorizations, not statistical aggregation requiring confidence intervals."
     60       },
     61       "significance_tests": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "The survey performs no statistical tests; it categorizes and describes trends qualitatively."
     65       },
     66       "effect_sizes_reported": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No effect sizes are computed; the survey reports paper counts and categorical breakdowns."
     70       },
     71       "sample_size_justified": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No statistical sampling requiring power analysis; the corpus size (102 papers) is determined by the systematic search process."
     75       },
     76       "variance_reported": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No experimental runs are conducted; variance reporting is inapplicable to this survey format."
     80       }
     81     },
     82     "evaluation_design": {
     83       "baselines_included": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 1 explicitly compares this survey against 6 related literature reviews across dimensions of year, paper count, topics, and Verilog focus, positioning the contribution relative to prior work."
     87       },
     88       "baselines_contemporary": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The six compared surveys are from 2024-2025 (Jiang et al. 2024, Joel et al. 2024, Fang et al. 2025, He et al. 2024, Chen et al. 2024, and Chen et al. 2025), representing the current state of the field."
     92       },
     93       "ablation_study": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "This is a survey paper with no system components to ablate."
     97       },
     98       "multiple_metrics": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "No experiments are conducted requiring evaluation metrics."
    102       },
    103       "human_evaluation": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "No system outputs to evaluate; this is a literature review."
    107       },
    108       "held_out_test_set": {
    109         "applies": false,
    110         "answer": false,
    111         "justification": "No experiments requiring train/test splits."
    112       },
    113       "per_category_breakdown": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The survey provides detailed breakdowns by LLM type (open vs. closed source, base vs. instruction-tuned), by year, by venue (Figure 4), by dataset type (Tables 6-7), by technique category (training-free vs. training-based), and by alignment dimension (Section 7)."
    117       },
    118       "failure_cases_discussed": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 8.1 discusses four categories of limitations: foundation/knowledge gaps, data/benchmark scarcity, evaluation/alignment deficits, and deployment readiness challenges."
    122       },
    123       "negative_results_reported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper identifies significant negative findings: no current solution integrates into industrial EDA workflows (Section 8.1), existing benchmarks are too small (<100 samples), current metrics ignore critical hardware dimensions like PPA and security, and alignment research is nascent."
    127       }
    128     },
    129     "claims_and_evidence": {
    130       "abstract_claims_supported": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The abstract claims to analyze 102 papers (70 published, 32 preprints) structured around 4 RQs, which is fully supported by the content. The claim of being the first comprehensive survey on LLM-based Verilog generation is supported by Table 1's comparison with existing reviews."
    134       },
    135       "causal_claims_justified": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "The paper makes no causal claims of its own. It describes trends ('growth from 12 to 274 usages'), categorizes techniques, and summarizes findings from reviewed papers without asserting causal relationships from its own analysis."
    139       },
    140       "generalization_bounded": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The scope is clearly bounded to LLM-based Verilog code generation (not HDLs generally), covering papers from 2020 to October 2025. The title and abstract accurately reflect this specific focus."
    144       },
    145       "alternative_explanations_discussed": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "This is a pure survey/taxonomy that presents descriptive categorizations rather than empirical results requiring alternative explanations."
    149       },
    150       "proxy_outcome_distinction": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper's claims match the granularity of its measurements. It reports paper counts, categorizations, and trend descriptions, and frames them exactly as such without overreaching to broader claims about the field's overall health or impact."
    154       }
    155     },
    156     "setup_transparency": {
    157       "model_versions_specified": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "The survey does not use any LLMs in its own methodology; it reviews papers that use LLMs."
    161       },
    162       "prompts_provided": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No prompting is used in the survey's methodology."
    166       },
    167       "hyperparameters_reported": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No experiments requiring hyperparameters are conducted."
    171       },
    172       "scaffolding_described": {
    173         "applies": false,
    174         "answer": false,
    175         "justification": "No agentic scaffolding is used in the survey's methodology."
    176       },
    177       "data_preprocessing_documented": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 3 thoroughly documents the paper selection pipeline: six databases searched with specific keyword sets (Section 3.2.1), three-stage filtering (5,172 → 687 → 124 papers) with explicit inclusion/exclusion criteria (Table 2), quality assessment with 5 QAC criteria and a 12/15 threshold (Table 3), and snowballing (186 candidates → 15 additions → 102 final)."
    181       }
    182     },
    183     "limitations_and_scope": {
    184       "limitations_section_present": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 9 'Threats to Validity' provides a dedicated discussion covering three specific threats: paper search omission, study selection bias, and categorization/analysis bias."
    188       },
    189       "threats_to_validity_specific": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The threats are specific to this study: 'the fast-paced nature of LLM research may lead to overlooking emerging work,' 'reduction from initial candidates to the final set involved subjective quality assessments,' and 'categorizing diverse methodologies introduces potential classification bias.' Mitigations are also described for each."
    193       },
    194       "scope_boundaries_stated": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The scope is explicitly bounded: Verilog code generation specifically (not all HDLs), LLM-based approaches only, publications from 2020 to October 2025, and the paper states it covers 102 papers from specific venues and databases."
    198       }
    199     },
    200     "data_integrity": {
    201       "raw_data_available": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No structured dataset of coded paper attributes, quality assessment scores, or extraction spreadsheets is released. Only the reference list provides partial raw data."
    205       },
    206       "data_collection_described": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 3.2 describes the collection procedure in detail: manual search of 8 venues yielded 16 QGS papers, automated search across 6 databases using specified keyword sets, with temporal scope from 2020 onward."
    210       },
    211       "recruitment_methods_described": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "For a literature review, 'recruitment' corresponds to the paper search strategy, which is described in Section 3.2: a manual search of 8 premier venues (including AAAI, ACL, ICML, ICLR, NeurIPS, DAC, and TCAD) and an automated search across IEEE Xplore, ACM DL, ScienceDirect, Web of Science, SpringerLink, and arXiv with specified keyword combinations."
    215       },
    216       "data_pipeline_documented": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Figure 3 and Section 3.3 document the full pipeline with counts at each stage: initial screening (→5,172), content-based filtering (→687), full-text assessment (→124), quality assessment (→85), snowballing (+15 → 102 final). Criteria for each stage are specified in Table 2."
    220       }
    221     },
    222     "conflicts_of_interest": {
    223       "funding_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Section 11 acknowledges partial support from the National Natural Science Foundation of China (NSFC, No. 62141208)."
    227       },
    228       "affiliations_disclosed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "All author affiliations are listed: Zhejiang University, Northwestern Polytechnical University, Nantong University, Monash University, and Singapore Management University. None appear to be affiliated with companies whose products are being reviewed."
    232       },
    233       "funder_independent_of_outcome": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "NSFC is a Chinese government research funding agency with no commercial stake in the survey's conclusions about LLM-based Verilog code generation."
    237       },
    238       "financial_interests_declared": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No competing interests or financial interests statement is included in the paper."
    242       }
    243     },
    244     "contamination": {
    245       "training_cutoff_stated": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "This is a survey paper that does not evaluate any pre-trained model's capability on benchmarks."
    249       },
    250       "train_test_overlap_discussed": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    254       },
    255       "benchmark_contamination_addressed": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    259       }
    260     },
    261     "human_studies": {
    262       "pre_registered": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this systematic literature review."
    266       },
    267       "irb_or_ethics_approval": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this systematic literature review."
    271       },
    272       "demographics_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this systematic literature review."
    276       },
    277       "inclusion_exclusion_criteria": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this systematic literature review."
    281       },
    282       "randomization_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this systematic literature review."
    286       },
    287       "blinding_described": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in this systematic literature review."
    291       },
    292       "attrition_reported": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "No human participants in this systematic literature review."
    296       }
    297     },
    298     "cost_and_practicality": {
    299       "inference_cost_reported": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "This is a survey paper with no computational method of its own."
    303       },
    304       "compute_budget_stated": {
    305         "applies": false,
    306         "answer": false,
    307         "justification": "This is a survey paper with no computational experiments."
    308       }
    309     },
    310     "survey_methodology": {
    311       "prisma_or_structured_protocol": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "The paper follows the Kitchenham SLR methodology (reference [59]) with the Quasi-Gold Standard (QGS) strategy (reference [133]). Section 3 describes the full protocol: planning (RQs, search strategy), conducting (three-stage filtering, quality assessment, snowballing), and analysis, with a systematic flow diagram (Figure 3)."
    315       },
    316       "quality_assessment_of_sources": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Table 3 provides five Quality Assessment Criteria (QAC1-5) scored on a 0-3 scale with a 12/15 (80%) inclusion threshold. Criteria cover venue prestige, contribution, workflow clarity, experimental detail, and argument support. Preprints lacking venue scores must demonstrate exceptional merit in other dimensions."
    320       },
    321       "publication_bias_discussed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper does not discuss publication bias. Section 9 addresses search omission, selection bias, and categorization bias, but never considers whether the surveyed literature skews toward positive results or whether negative findings about LLM-based Verilog generation are underrepresented."
    325       }
    326     }
    327   },
    328   "claims": [
    329     {
    330       "claim": "This is the first comprehensive systematic literature review specifically focused on LLM-based Verilog code generation.",
    331       "evidence": "Table 1 compares against 6 related surveys, showing none provide full Verilog focus; prior surveys either cover general code generation (Jiang 2024, Joel 2024, Chen 2025) or broad EDA (Fang 2025, He 2024, Chen 2024) with only partial Verilog coverage.",
    332       "supported": "strong"
    333     },
    334     {
    335       "claim": "Research in LLM-based Verilog code generation has grown exponentially from 1 paper in 2020 to 66 by September 2025.",
    336       "evidence": "Figure 4b shows year-by-year counts: 1 (2020), 0 (2021), 0 (2022), 6 (2023), 29 (2024), 66 (2025). This quantitative evidence directly from the systematic search supports the exponential growth claim.",
    337       "supported": "strong"
    338     },
    339     {
    340       "claim": "Open-source Base LLM usages grew 233% from 2024 to 2025 (46 to 153 mentions), with Llama, DeepSeek, and Qwen becoming the dominant families.",
    341       "evidence": "Table 4 provides detailed counts by LLM family and year. Llama: 0→15→49, DeepSeek: 0→6→48, Qwen: 0→3→25 across 2023-2025.",
    342       "supported": "strong"
    343     },
    344     {
    345       "claim": "GPT series dominates closed-source usage with 149 of 179 closed-source mentions (83.2%).",
    346       "evidence": "Table 4 reports GPT series at 7+42+100=149 mentions vs. Claude (19), Gemini (5), and Others (6) totaling 179 closed-source mentions.",
    347       "supported": "strong"
    348     },
    349     {
    350       "claim": "34 instruction-tuned LLMs have been developed for Verilog, with 19 (55.9%) providing open weights, predominantly built on code-specialized foundations.",
    351       "evidence": "Table 5 lists all 34 IT LLMs with their foundation models, weight availability, and URLs. 28/34 (82.4%) use coding-oriented foundations, with DeepSeek-Coder (11 models), Qwen coder (9), and CodeLlama (8) as the top clusters.",
    352       "supported": "strong"
    353     },
    354     {
    355       "claim": "Current research has critical gaps in deployment readiness, with no solution integrating seamlessly into industrial EDA workflows.",
    356       "evidence": "Section 8.1(4) states this limitation, identifying lacks in interactive refinement, explainability, and human-in-the-loop features. This is a qualitative assessment synthesized from the reviewed literature.",
    357       "supported": "moderate"
    358     }
    359   ],
    360   "red_flags": [
    361     {
    362       "flag": "Publication bias not addressed",
    363       "detail": "Despite being a systematic review, the paper never discusses whether the surveyed literature skews toward positive results about LLM-based Verilog generation. No funnel plots, no discussion of negative-result underrepresentation, and no acknowledgment that the field may systematically over-report successes. This is especially concerning for a fast-moving field with strong publication incentives."
    364     },
    365     {
    366       "flag": "Quality scores not reported per paper",
    367       "detail": "While the paper uses a 5-criterion quality assessment (QAC) for inclusion decisions, the quality scores of individual papers are not reported. This means the synthesis treats all 102 included papers as equally reliable, without weighting conclusions by the methodological strength of their sources."
    368     },
    369     {
    370       "flag": "No artifacts released for a survey",
    371       "detail": "No structured dataset, search query logs, coding spreadsheets, or analysis scripts are released. The survey's extraction and categorization cannot be independently verified or built upon, limiting its value as a living resource."
    372     }
    373   ],
    374   "cited_papers": [
    375     {
    376       "title": "Evaluating Large Language Models Trained on Code",
    377       "authors": ["Mark Chen", "Jerry Tworek"],
    378       "year": 2021,
    379       "arxiv_id": "2107.03374",
    380       "relevance": "Foundational paper on Codex for code generation, establishing the paradigm of LLM-based code synthesis that extends to hardware description languages."
    381     },
    382     {
    383       "title": "Code Llama: Open Foundation Models for Code",
    384       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    385       "year": 2023,
    386       "arxiv_id": "2308.12950",
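    387       "relevance": "Major open-source code model family. The Llama family is the most-mentioned open-source base LLM in the survey (64 mentions across 2023-2025), and CodeLlama is the foundation for 8 of the 34 Verilog instruction-tuned LLMs, behind DeepSeek-Coder (11) and Qwen coder (9)."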
    388     },
    389     {
    390       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    391       "authors": ["Daya Guo", "Qihao Zhu"],
    392       "year": 2024,
    393       "arxiv_id": "2401.14196",
    394       "relevance": "Code-specialized foundation model that is the most common base for Verilog instruction-tuned LLMs (11 of 34 IT models)."
    395     },
    396     {
    397       "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation",
    398       "authors": ["Mingjie Liu", "Nathaniel Ross Pinckney"],
    399       "year": 2023,
    400       "doi": "10.1109/ICCAD57390.2023.10323812",
    401       "relevance": "Key benchmark for evaluating LLM-based Verilog generation, derived from HDLBits with 156 human-crafted and 143 machine-generated problems."
    402     },
    403     {
    404       "title": "VeriGen: A Large Language Model for Verilog Code Generation",
    405       "authors": ["Shailja Thakur", "Baleegh Ahmad"],
    406       "year": 2024,
    407       "doi": "10.1145/3643681",
    408       "relevance": "Early open-source LLM specifically fine-tuned for Verilog generation, with released weights and training data."
    409     },
    410     {
    411       "title": "RTLCoder: Fully Open-Source and Efficient LLM-Assisted RTL Code Generation Technique",
    412       "authors": ["Shang Liu", "Wenji Fang"],
    413       "year": 2025,
    414       "doi": "10.1109/TCAD.2024.3483089",
    415       "relevance": "Open-weight RTL code generation model using LLM-synthesized training data, demonstrating data-centric approaches to Verilog fine-tuning."
    416     },
    417     {
    418       "title": "AutoVCoder: A Systematic Framework for Automated Verilog Code Generation using LLMs",
    419       "authors": ["Mingzhe Gao", "Jieru Zhao"],
    420       "year": 2024,
    421       "doi": "10.1109/ICCD63220.2024.00033",
    422       "relevance": "Framework combining large-scale data curation (1M samples) with curriculum learning for Verilog generation."
    423     },
    424     {
    425       "title": "CodeV: Empowering LLMs with HDL Generation through Multi-Level Summarization",
    426       "authors": ["Yang Zhao", "Di Huang"],
    427       "year": 2025,
    428       "doi": "10.1109/TCAD.2025.3604320",
    429       "relevance": "Multi-level summarization approach for HDL code generation with MinHash deduplication for contamination control."
    430     },
    431     {
    432       "title": "MAGE: A Multi-Agent Engine for Automated RTL Code Generation",
    433       "authors": ["Yujie Zhao", "Hejia Zhang"],
    434       "year": 2025,
    435       "doi": "10.1109/DAC63849.2025.11133191",
    436       "relevance": "Multi-agent framework with four specialized agents for Verilog generation, representing the trend toward agentic approaches in hardware design."
    437     },
    438     {
    439       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    440       "authors": ["Xinyi Hou", "Yanjie Zhao"],
    441       "year": 2024,
    442       "doi": "10.1145/3695988",
    443       "relevance": "Comprehensive SLR on LLMs for general software engineering that serves as a methodological baseline for this Verilog-specific survey."
    444     },
    445     {
    446       "title": "DeepRTL: Bridging Verilog Understanding and Generation with a Unified Representation Model",
    447       "authors": ["Yi Liu", "Changran Xu"],
    448       "year": 2025,
    449       "relevance": "Unified model for both Verilog understanding and generation with Chain-of-Thought annotation and multi-stage expert review achieving 90% annotation accuracy."
    450     },
    451     {
    452       "title": "VeriReason: Reinforcement Learning with Testbench Feedback for Reasoning-Enhanced Verilog Generation",
    453       "authors": ["Yiting Wang", "Guoheng Sun"],
    454       "year": 2025,
    455       "arxiv_id": "2505.11849",
    456       "relevance": "Combines reinforcement learning (GRPO) with AST-based rewards for reasoning-enhanced Verilog code generation."
    457     }
    458   ],
    459   "engagement_factors": {
    460     "practical_relevance": {
    461       "score": 1,
    462       "justification": "Provides a useful reference map of 34 instruction-tuned LLMs and 27 benchmarks for practitioners in hardware design automation, but is not directly usable as a tool or technique."
    463     },
    464     "surprise_contrarian": {
    465       "score": 0,
    466       "justification": "Findings confirm expected trends (LLM adoption growing, GPT dominant in closed-source, open-source catching up) without challenging any conventional wisdom."
    467     },
    468     "fear_safety": {
    469       "score": 1,
    470       "justification": "Section 7.1 discusses security risks including hardware trojans from data poisoning and benchmark contamination, but these are reviewed rather than newly demonstrated."
    471     },
    472     "drama_conflict": {
    473       "score": 0,
    474       "justification": "No controversy or conflict; the paper is a neutral categorization of existing work."
    475     },
    476     "demo_ability": {
    477       "score": 0,
    478       "justification": "No code, demo, or interactive tool is released alongside the survey."
    479     },
    480     "brand_recognition": {
    481       "score": 1,
    482       "justification": "Authors from Zhejiang University and Singapore Management University are known in SE research but not household names; David Lo is well-known in the SE community."
    483     }
    484   }
    485 }

Impressum · Datenschutz