ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27446B)


      1 {
      2   "paper": {
      3     "title": "LeakSealer: A Semisupervised Defense for LLMs Against Prompt Injection and Leakage Attacks",
      4     "authors": [
      5       "Francesco Panebianco",
      6       "Stefano Bonfanti",
      7       "Francesco Trovò",
      8       "Michele Carminati"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2508.00602",
     13     "doi": "10.48550/arXiv.2508.00602"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The paper states 'LeakSealer's source code will be publicly available' (Section 5.1) — this is a promise of future release. No repository URL is provided."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper says 'We will open-source the dataset as a benchmark for future research' (Section 6.1). The PII dataset is only promised, not released. The OpenAI and ToxicChat datasets used are public, but the paper's novel contribution (PII dataset) is not yet available."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Hardware is listed (Tesla P100 GPU, 16GB VRAM, Intel Xeon CPU) and fp16 precision is mentioned (Section 6), but no requirements.txt, Dockerfile, or library version specifications are provided."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating results are provided."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Tables 1, 2, and 3 report only point estimates (e.g., purity 0.97, AUPRC 0.97). No confidence intervals, error bars, or ± notation appear anywhere."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims LeakSealer 'outperforms' and 'significantly surpasses' baselines based solely on comparing metric values. No statistical significance tests (p-values, t-tests, etc.) are performed."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Results provide baseline context: e.g., 'AUPRC of 0.97' vs Llama Guard's 0.84 (Table 3), recall 0.88 vs second-best 0.65 (Table 2). Both method and baseline numbers are reported, allowing relative comparison."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification for dataset sizes. The PII dataset (N=1,048) and the use of existing datasets (N=1,680 and N=10,166) are not justified by power analysis or other means."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance, standard deviation, or multiple-run results are reported. All results appear to be from single experimental runs."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Five baselines are compared: Llama Guard 3 8B, and LLM-As-A-Judge with DeepSeek-R1, GPT-4o, Ministral 8B, and Llama 3.1 8B (Section 6)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include Llama Guard 3 (2024), DeepSeek-R1 (2025), GPT-4o (2024/2025), Ministral 8B (2024) — all recent and competitive."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "LeakSealer has multiple components (embedding, PCA, UMAP, HDBSCAN, classifier selection) but no ablation study is performed to measure individual component contributions."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple metrics reported: Accuracy, Precision, Recall, F1-score, AUPRC, and Purity (Tables 1-3)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "All evaluation is automated using labeled datasets. The human labeling of the PII dataset is data annotation, not evaluation of the system's outputs. No human evaluation of LeakSealer's outputs is performed."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Dynamic setting uses 'random 80%-20% split for the training and test sets' (Section 6.3). Static setting uses the ToxicChat test split (Appendix A)."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results broken down by dataset (OpenAI, ToxicChat, PII) and by baseline method in Tables 1-3. Static vs dynamic settings also shown separately."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 7 discusses where LeakSealer underperforms: on the smaller OpenAI dataset, GPT-4o and Llama Guard outperform it on several metrics. The paper attributes this to dataset size and potential pre-training bias in LLM baselines."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper honestly reports that on the OpenAI dataset, Llama Guard 3 achieves better AUPRC (0.90 vs 0.83) and GPT-4o achieves better recall (0.94 vs 0.75) in the dynamic setting (Table 3)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims of highest precision/recall on ToxicChat and AUPRC of 0.97 on PII dataset are supported by Tables 1-3. The claim about outperforming Llama Guard (0.84) is confirmed in Table 3."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes implicit causal claims ('This shows the effectiveness of employing HDBSCAN's exemplars', Section 6.2) without ablation studies or controlled experiments to isolate what drives performance. The multi-component pipeline is not decomposed."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title claims defense 'Against Prompt Injection and Leakage Attacks' broadly, but testing is limited to toxicity detection and one specific PII leakage scenario. The paper acknowledges toxicity as 'a representative measure' in limitations but doesn't bound the generalization of the title claims."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 7 discusses alternative explanations: LLM baselines may benefit from pre-training on public datasets (potential overfitting), dataset size may explain OpenAI performance differences, and class imbalance may inflate accuracy for some baselines."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper acknowledges in limitations that 'toxicity of generated text was adopted as a representative measure for evaluating unsafe behaviors' — explicitly identifying the proxy gap between what is measured (toxicity detection) and the broader claim (defense against prompt injection)."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Models listed as 'Llama Guard 3 8B', 'Llama 3.1 8B', 'Ministral 8B', 'GPT-4o' (Section 6). GPT-4o has no snapshot date or API version. Llama models specify size (8B) but not specific checkpoint versions."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Full system prompts for all LLM-As-A-Judge baselines are provided in Appendix D (Sections D.1-D.3). Email generation prompts are shown in Listing 1 (Appendix C)."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Email generation hyperparameters are reported (temperature 0.7, top_p 0.9, etc. in Appendix C). Dimensionality-reduction targets (PCA to 50, UMAP to 10) are specified in Appendix B. However, classification model hyperparameters (SVM, RF, XGBoost, k-NN) and LLM inference settings (temperature for baselines) are not reported."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "LeakSealer is a clustering/classification pipeline, not an agentic scaffolding system. No agentic scaffolding is used."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Preprocessing pipeline is documented in Appendix B: stella_en_400M_v5 embeddings (e=1024) → PCA to 50 dimensions → UMAP to 10 dimensions (static) or PCA only (dynamic) → HDBSCAN clustering. Data generation pipeline for PII dataset is detailed in Appendix C."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "A 'Limitations and Future Work' subsection exists in Section 8 (Conclusions) but contains only ~3 sentences, mostly forward-looking ('Future research directions include...'). This does not constitute substantive discussion of limitations."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The only specific limitation mentioned is using toxicity as a proxy for unsafe behavior. No discussion of threats such as: simulated (not real) human feedback, selection bias in the PII dataset, limited attack diversity, or generalizability to other domains."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not explicitly state what the results do NOT show. No specific exclusions or untested scenarios are enumerated."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The novel PII dataset is only promised for future release ('We will open-source the dataset', Section 6.1). No download link is provided. The OpenAI and ToxicChat datasets are publicly available but the paper's own data contribution is not."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "PII dataset creation is described in extensive detail in Section 6.1 and Appendix C: document generation methods (procedural, LLM-generated), identity generation via barnum library, manual review process, RAG system setup, and labeling procedure."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants in the study. Data is synthetically generated and publicly available benchmarks are used."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The full pipeline is documented: identity generation (barnum) → document creation (procedural/LLM) → manual review → RAG database population → question generation → LLM interaction → human labeling → dataset (N=1,048 interactions). Appendix C provides detailed steps for each document type."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding section or acknowledgments mentioning grants or sponsors appear anywhere in the paper."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: three authors from Politecnico di Milano, one from ML cube (a company)."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding is disclosed, so independence cannot be assessed. One author is from ML cube (a company that may have commercial interest in LLM security solutions), but this is not discussed."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests statement is present. One author is affiliated with ML cube (a company), but no declaration of financial interests, patents, or equity is made."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper tests a defense framework (LeakSealer), not a pre-trained model's capability on benchmarks. The LLMs are used as baselines for comparison, but the core contribution is a semi-supervised clustering/classification approach."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not applicable for the same reason as training_cutoff_stated: the paper evaluates a defense system, not model knowledge. However, the paper does note potential contamination bias in LLM baselines (Section 7)."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "LeakSealer is a defense framework, not a pre-trained model evaluated on benchmark knowledge. Contamination does not apply to the primary contribution."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in the study. The PII dataset uses synthetic data with human labeling, but this is not a human subjects study."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The PII dataset is synthetically generated and the other datasets are existing public benchmarks."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper claims LeakSealer is 'lightweight' and uses efficient ML architectures (Section 7, 'Performance remarks') but provides no specific latency, cost per example, or timing measurements."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "Hardware is described (Tesla P100 GPU, 16GB VRAM) but total computational budget (GPU hours, training time, total API spend for baseline LLM evaluations) is not quantified."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is not stated. Results are presented without indicating how many runs produced them."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The paper mentions 'nested cross-validation' for model selection among SVM, RF, XGBoost, and k-NN (Appendix B) but does not report the search budget, configurations tried, or compute spent on search."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "The classification model is 'robustly selected through a nested cross-validation procedure' among SVM, RF, XGBoost, and k-NN (Appendix B). Nested CV is a principled selection method that avoids overfitting to the test set."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Multiple comparisons across 5 baselines, 3 datasets, and 2 settings are made with no correction for multiple comparisons."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors evaluate their own system against baselines without acknowledging author-evaluation bias. No independent evaluation or discussion of this bias."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper qualitatively claims LeakSealer is lighter than LLM-based approaches (Section 7) but provides no quantitative compute comparison. LLM baselines require full inference vs lightweight ML classifiers, but this is not measured."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "The paper acknowledges in limitations that 'toxicity of generated text was adopted as a representative measure for evaluating unsafe behaviors' (Section 8), explicitly identifying the gap between the benchmark measure and the broader defense claim."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No scaffolding is involved. LeakSealer is a clustering/classification pipeline, not an agentic system."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Sections 6.2 and 7 discuss that 'LLMs are trained on web-crawled data, which raises concerns regarding potential overfitting due to the availability of certain datasets online.' The paper notes LeakSealer avoids this bias via its semi-supervised approach without pre-training."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether evaluation features contain information not available at prediction time."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The 80/20 random split for dynamic evaluation does not address whether train and test examples share structural similarities (e.g., similar attack patterns, same document types in PII dataset)."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The contamination concern is raised conceptually but not tested."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "In the static setting, LeakSealer achieves the highest F1-score (0.77) on both ToxicChat and PII datasets, outperforming all baselines.",
    368       "evidence": "Table 2 shows F1 scores: LeakSealer 0.77 on ToxicChat (vs second-best 0.61) and 0.77 on PII (vs second-best 0.72 for GPT-4o).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "In the dynamic setting, LeakSealer achieves AUPRC of 0.97 on PII leakage detection, outperforming Llama Guard (0.84).",
    373       "evidence": "Table 3 and Figure 4c show AUPRC 0.97 for LeakSealer vs 0.84 for Llama Guard 3 on the PII dataset.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "LeakSealer's recall for PII detection (0.88 static, 0.95 dynamic) significantly exceeds all baselines.",
    378       "evidence": "Table 2: recall 0.88 vs second-best 0.65 (Ministral). Table 3: recall 0.95 vs second-best 0.59 (GPT-4o and Ministral).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "LeakSealer is not affected by pre-training bias that may inflate LLM baseline performance on public datasets.",
    383       "evidence": "Section 7 argues LeakSealer's semi-supervised pipeline avoids pre-training on web data. However, this is a plausibility argument without empirical verification.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Cluster purity reaches 0.97 on ToxicChat, demonstrating effective separation of safe and unsafe interactions.",
    388       "evidence": "Table 1 reports purity 0.97 on ToxicChat. The paper notes this may partly reflect class imbalance (92% safe, 8% unsafe).",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "LeakSealer is a semisupervised framework combining clustering-based static analysis with a dynamic defense informed by human-in-the-loop (HITL) feedback for detecting prompt injection and PII leakage in LLM systems. On the PII leakage task in the dynamic setting, it achieves AUPRC of 0.97 vs 0.84 for Llama Guard, with notably higher recall (0.95 vs 0.31). Performance on ToxicChat is also superior (F1 0.72 vs 0.48 for Llama Guard). On the smaller OpenAI dataset, results are mixed, with Llama Guard achieving higher AUPRC (0.90 vs 0.83). The paper also introduces a curated PII leakage dataset (N=1,048) with labeled RAG interactions.",
    394   "red_flags": [
    395     {
    396       "flag": "No error bars or variance across runs",
    397       "detail": "All results are reported as point estimates from apparently single runs. No standard deviations, confidence intervals, or seed sensitivity analysis. The 80/20 random split could produce different results with different random seeds, but this is never tested."
    398     },
    399     {
    400       "flag": "Simulated human-in-the-loop",
    401       "detail": "The HITL pipeline uses ground truth labels from the dataset to simulate human feedback (Appendix B: 'the human feedback is simulated using the ground truth from the dataset'). Real-world HITL performance — where human annotators may disagree, make errors, or face ambiguous cases — is untested."
    402     },
    403     {
    404       "flag": "Code and data promised but not released",
    405       "detail": "Both LeakSealer source code and the PII dataset are described as 'will be' released. No repository URLs or download links are provided. Reproducibility cannot be verified."
    406     },
    407     {
    408       "flag": "Self-created benchmark evaluated by same team",
    409       "detail": "The PII dataset is created, labeled, and evaluated by the same research team with no independent validation. The dataset has not been previously published and thus could not introduce pre-training bias, but this also means no external quality check exists."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond",
    415       "authors": ["Jingfeng Yang", "Hongye Jin", "Ruixiang Tang"],
    416       "year": 2024,
    417       "relevance": "Survey on LLM capabilities and deployment patterns, relevant to understanding the landscape of LLM applications."
    418     },
    419     {
    420       "title": "Retrieval-augmented generation for large language models: A survey",
    421       "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"],
    422       "year": 2023,
    423       "arxiv_id": "2312.10997",
    424       "relevance": "Survey on RAG systems, directly relevant to the RAG context vulnerability that LeakSealer addresses."
    425     },
    426     {
    427       "title": "Do Anything Now: Characterizing and Evaluating In-the-Wild Jailbreak Prompts on Large Language Models",
    428       "authors": ["Xinyue Shen", "Zeyuan Chen", "Michael Backes"],
    429       "year": 2024,
    430       "relevance": "Systematic analysis of jailbreak prompts in the wild, foundational work on the threat model LeakSealer defends against."
    431     },
    432     {
    433       "title": "The good and the bad: Exploring privacy issues in retrieval-augmented generation (RAG)",
    434       "authors": ["Shenglai Zeng", "Jiankun Zhang", "Pengfei He"],
    435       "year": 2024,
    436       "relevance": "Demonstrates PII leakage attacks against RAG systems, the specific threat scenario that motivates LeakSealer's PII defense."
    437     },
    438     {
    439       "title": "Llama guard: LLM-based input-output safeguard for human-ai conversations",
    440       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    441       "year": 2023,
    442       "arxiv_id": "2312.06674",
    443       "relevance": "Primary baseline for LeakSealer comparison; LLM-based safety classification approach that LeakSealer aims to outperform."
    444     },
    445     {
    446       "title": "An empirical study of llm-as-a-judge for llm evaluation: Fine-tuned judge models are task-specific classifiers",
    447       "authors": ["Hui Huang", "Yingqi Qu", "Jing Liu"],
    448       "year": 2024,
    449       "arxiv_id": "2403.02839",
    450       "relevance": "Evaluates LLM-as-a-judge approach used as a baseline in LeakSealer experiments."
    451     },
    452     {
    453       "title": "Prompt injection attacks and defenses in llm-integrated applications",
    454       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng"],
    455       "year": 2023,
    456       "arxiv_id": "2310.12815",
    457       "relevance": "Framework for formalizing prompt injection attacks and benchmarking defenses, directly relevant to LLM security evaluation."
    458     },
    459     {
    460       "title": "ProPILE: Probing Privacy Leakage in Large Language Models",
    461       "authors": ["Siwon Kim", "Sangdoo Yun", "Hwaran Lee"],
    462       "year": 2024,
    463       "relevance": "Tool for probing PII leakage from LLM training data, addresses the pre-training leakage problem distinct from RAG leakage."
    464     },
    465     {
    466       "title": "Whispers in the Machine: Confidentiality in LLM-integrated Systems",
    467       "authors": ["Jonathan Evertz", "Merlin Chlosta", "Lea Schönherr"],
    468       "year": 2024,
    469       "arxiv_id": "2402.06922",
    470       "relevance": "Explores confidentiality issues in LLM systems using secret key extraction games, a related but distinct framing of the leakage problem."
    471     },
    472     {
    473       "title": "Defending Large Language Models Against Jailbreak Attacks via Layer-specific Editing",
    474       "authors": ["Wei Zhao", "Zhe Li", "Yige Li"],
    475       "year": 2024,
    476       "arxiv_id": "2405.18166",
    477       "relevance": "White-box jailbreak defense via model layer editing, contrasted with LeakSealer's model-agnostic approach."
    478     },
    479     {
    480       "title": "PARDEN, Can You Repeat That? Defending against Jailbreaks via Repetition",
    481       "authors": ["Ziyang Zhang", "Qizhen Zhang", "Jakob Foerster"],
    482       "year": 2024,
    483       "arxiv_id": "2405.07932",
    484       "relevance": "Repetition-based jailbreak defense that requires repeated inference, contrasting with LeakSealer's lightweight approach."
    485     },
    486     {
    487       "title": "ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World User-AI Conversation",
    488       "authors": ["Zi Lin", "Zihan Wang", "Yongqi Tong"],
    489       "year": 2023,
    490       "relevance": "Benchmark dataset used in LeakSealer evaluation for toxicity detection in real-world AI conversations."
    491     }
    492   ]
    493 }

Impressum · Datenschutz