ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (31208B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LeakSealer: A Semisupervised Defense for LLMs Against Prompt Injection and Leakage Attacks",
      6     "authors": [
      7       "Francesco Panebianco",
      8       "Stefano Bonfanti",
      9       "Francesco Trovò",
     10       "Michele Carminati"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2508.00602",
     15     "doi": "10.48550/arXiv.2508.00602"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's main claims—highest F1 on ToxicChat in the static setting and AUPRC 0.97 for PII leakage in the dynamic setting—are supported by Tables 2 and 3, respectively. The 'highest precision and recall' phrasing is slightly imprecise since GPT-4o achieves higher recall (0.94) on the OpenAI dataset, but the ToxicChat-specific claims hold.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper makes causal claims that LeakSealer's design choices (clustering, semi-supervised labeling, lightweight classifiers) cause better detection, but no ablation study isolates which components drive the gains. Comparative benchmarks show correlation, not causation.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper presents LeakSealer as a general 'model-agnostic framework' for LLM deployment but evaluates on only 3 datasets with simulated human feedback. Claims of general applicability are not bounded to the tested settings or threat models.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper briefly notes LLM baselines may have seen benchmark datasets during pretraining, but does not explore whether LeakSealer's advantage could stem from embedding model choice, class imbalance artifacts, or the use of simulated rather than real human feedback.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper clearly measures precision, recall, F1, and AUPRC on labeled detection tasks and claims detection capability—the metrics are direct measures of the stated task without conflation.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is a brief 'Limitations and Future Work' paragraph embedded within the Conclusions section rather than a dedicated separate section, which does not meet the criterion of a dedicated limitations section.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The limitations paragraph mentions using 'toxicity as a representative measure' and future online training work, but does not specify concrete threats such as class imbalance effects, impact of simulating human feedback, or dataset size constraints on reliability.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper notes that white-box defenses are excluded and only black-box interaction is evaluated, but does not explicitly state what the results do NOT show (e.g., non-RAG architectures, non-English inputs, adversarial embedding attacks).",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources, grants, or acknowledgements section appears in the paper. One author (Bonfanti) is affiliated with ML cube (an industry entity) but no research funding is disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are listed on the title page: three authors at Politecnico di Milano and Bonfanti at ML cube (an industry partner), clearly disclosing academic and industrial affiliations.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funder is identified, so independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosures, or financial conflict declarations are present anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: Prompt Injection, PII Leakage, RAG, and the threat model are formally defined in Sections 2 and 4. PII leakage is formalized mathematically in Section 4.2 with explicit probability notation.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly lists three contributions in the 'Original Contributions' bullet list: (1) historical analysis methodology, (2) LeakSealer framework with static and dynamic modes, and (3) a curated PII dataset.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 3 and the 'Discussion of Prior Work' subsection explicitly compare with Llama Guard, PARDEN, LED, ProPILE, and RAG privacy work, explaining how LeakSealer addresses each approach's limitations (computational cost, architecture dependency, dataset bias).",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The paper body states 'LeakSealer's source code will be publicly available' (future tense), indicating code was not released at submission time despite the abstract claiming open-source release.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The paper states 'We will open-source the dataset as a benchmark for future research' (future tense); the custom PII dataset was not available at submission. The third-party benchmarks (ToxicChat, OpenAI Content Moderation) are publicly available.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Hardware specs (Tesla P100 GPU, Intel Xeon CPU) and model names are provided, but no requirements.txt, Dockerfile, or Python package versions are specified beyond named libraries (HDBSCAN, UMAP, stella embedding model).",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Appendix B provides algorithmic details and Appendix D provides full prompts, but without released code, reproducing the full pipeline requires significant re-implementation without step-by-step commands or scripts.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Tables 1, 2, and 3 report only point estimates for all metrics (precision, recall, F1, AUPRC, purity, accuracy) with no confidence intervals or error bars.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to any comparative results despite multiple cross-method comparisons being made throughout the paper.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute metric differences are reported (e.g., AUPRC 0.97 vs 0.84 for Llama Guard on PII; recall 0.88 vs 0.65 for best baseline in static PII), allowing effect sizes to be directly calculated from presented numbers.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Dataset sizes are stated (1,680 OpenAI; 10,166 ToxicChat; 1,048 PII) but no power analysis or statistical justification for why these sample sizes are sufficient for the claims made is provided.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No standard deviation, variance, or spread across runs is reported for any metric. All results in Tables 1–3 are single-run point estimates.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Multiple baselines are included: Llama Guard 3 8B and four LLM-as-a-Judge models (Llama 3.1 8B, DeepSeek-R1 DistilLlama 8B, Ministral 8B, GPT-4o).",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "All baselines are recent state-of-the-art models from 2023–2025: Llama Guard 3, GPT-4o, DeepSeek-R1, Ministral 8B, and Llama 3.1 8B.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "No ablation study is conducted. The contributions of individual components (embedding model, PCA, UMAP, HDBSCAN, classifier type) to the performance gains are not isolated.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics are reported throughout: cluster purity, accuracy, precision, recall, F1-score, and AUPRC, plus precision-recall curves in Figures 2 and 4.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "The HITL component is central to LeakSealer but Appendix B explicitly states 'the human feedback is simulated using the ground truth from the dataset'—no actual human evaluation of system outputs was conducted.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Section 6.3 states a random 80–20 train-test split is used for the dynamic setting evaluation.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down across three distinct dataset categories (OpenAI Content Moderation, ToxicChat, and the custom PII dataset), each representing a different detection scenario with separate tables.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 7 explicitly discusses the OpenAI dataset as a failure case where Llama Guard 3 outperforms LeakSealer in accuracy, precision, and AUPRC, and proposes explanations (dataset size, potential LLM training contamination).",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "On the OpenAI Content Moderation dataset (Table 3), Llama Guard 3 achieves higher accuracy (0.86), precision (0.77), and AUPRC (0.90) than LeakSealer (0.83, 0.73, 0.83); these results are fully reported.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Most models are named with versions (Llama Guard 3 8B, Llama 3.1 8B, DeepSeek-R1 DistilLlama 8B, Ministral 8B) but GPT-4o is cited only as 'GPT-4o, 2025. Large language model.' without a specific API version or snapshot date.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix D provides complete system prompts for all three LLM-as-a-Judge conditions (OpenAI, ToxicChat, PII datasets), including full example inputs and outputs used in few-shot prompting.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Email generation hyperparameters (temperature 0.7, top_p 0.9, top_k 50) and dimensionality reduction dimensions (PCA 50, UMAP 10) are given, but classifier hyperparameters (SVM kernel/C, RF n_estimators, XGBoost/k-NN parameters) selected by nested cross-validation are not reported per dataset.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The LeakSealer pipeline is described through Algorithm 1, Figure 1, and Appendix B, covering embedding, PCA+UMAP dimensionality reduction, HDBSCAN clustering, exemplar selection, human feedback propagation, and classifier training.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Appendix C provides detailed documentation of the PII dataset construction pipeline: identity generation via barnum library, per-document-type generation procedures, RAG setup with TF-IDF indexing, and human labeling methodology.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The custom PII dataset is promised as a future open-source release but not available at submission. The third-party benchmarks are publicly available but no raw LeakSealer outputs are released.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Appendix C provides a detailed description of PII dataset collection: document type generation, LLM models and hyperparameters used, RAG query generation examples, and human labeling criteria with a nuanced distinction between PII and general information.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participant recruitment—the PII dataset is synthetically generated and labeled by the authors. The benchmark datasets are pre-existing.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Appendix C documents the full pipeline from identity generation (barnum library) through document generation, RAG deployment with TF-IDF, query generation, LLM response recording, and human labeling.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The paper raises contamination concerns for LLM baselines in Section 6.2 but never explicitly states the training data cutoff dates for any evaluated model (GPT-4o, Llama Guard 3, Llama 3.1, etc.).",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 6.2 explicitly states 'LLMs are trained on web-crawled data, which raises concerns regarding potential overfitting due to the availability of certain datasets online,' noting the public benchmarks may appear in LLM pretraining data.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "The paper directly addresses this by noting that the custom PII dataset 'has not been previously published and therefore could not have been incorporated into any training datasets,' and contrasts this with the potentially contaminated public benchmarks.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "The paper qualitatively describes LeakSealer as 'lightweight' and notes it avoids per-sample LLM inference, but no actual latency, throughput, or cost numbers are reported for comparison.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware specs are provided (Tesla P100 GPU, 16 GB VRAM, Intel Xeon CPU) but no total computational budget (GPU hours, training time, total processing time) for any experiment is stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "LeakSealer achieves AUPRC of 0.97 for PII leakage detection in the dynamic setting, significantly outperforming Llama Guard 3 (0.84).",
    374       "evidence": "Table 3 reports LeakSealer AUPRC 0.97 vs Llama Guard 3 AUPRC 0.84 on the PII dataset in the dynamic setting; AUPRC is not computed for any of the LLM-as-a-Judge baselines.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "In the static setting on ToxicChat, LeakSealer achieves the highest F1-score (0.77) and cluster purity (0.97) among all compared methods.",
    379       "evidence": "Table 2 shows LeakSealer F1 0.77 on ToxicChat vs next best (DeepSeek-R1 and Ministral at 0.61); Table 1 shows purity 0.97.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LeakSealer's recall on the PII dataset (0.88 static, 0.95 dynamic) significantly outperforms all baselines, with the next best being Ministral at 0.65 in static setting.",
    384       "evidence": "Tables 2 and 3 directly report these recall values; the 0.65 Ministral comparison is explicitly stated in Section 6.2.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "LeakSealer is computationally more efficient than LLM-based defenses because it uses lightweight ML classifiers that do not require GPU inference per sample.",
    389       "evidence": "Section 7 discusses this qualitatively; hardware specs and classifier types (SVM, RF, XGBoost, k-NN) are stated, but no actual latency or throughput numbers are provided.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "LLM baselines may suffer evaluation bias due to pretraining on publicly available benchmark datasets, giving LeakSealer a fairness advantage on the novel PII dataset.",
    394       "evidence": "Section 6.2 raises this contamination concern and argues the unpublished PII dataset avoids this bias; the argument is plausible but not empirically tested.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "HDBSCAN exemplars effectively represent full cluster semantics, evidenced by purity and accuracy converging to similar values across all three datasets.",
    399       "evidence": "Table 1 shows purity and accuracy are nearly identical per dataset (e.g., ToxicChat 0.97 purity / 0.96 accuracy; PII 0.77 / 0.75), directly supporting the exemplar representation claim.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "case-study"
    406   ],
    407   "key_findings": "LeakSealer is a semi-supervised framework that clusters LLM interaction embeddings via HDBSCAN and uses Human-In-The-Loop labeling to train lightweight classifiers (SVM, RF, XGBoost, k-NN) for detecting prompt injection and PII leakage in static (forensic) and dynamic (active defense) settings. In the static setting, it achieves cluster purity of 0.97 on ToxicChat and the highest F1-score on both ToxicChat and a custom PII dataset. In the dynamic setting, it achieves AUPRC of 0.97 for PII leakage detection, substantially outperforming Llama Guard 3 (0.84) and all LLM-as-a-Judge baselines. The framework is designed to be model-agnostic and computationally efficient by avoiding per-sample LLM inference. However, the HITL component is fully simulated using ground truth labels in all experiments, the code and dataset were not released at submission, no ablation study was conducted, and no statistical significance or uncertainty quantification is provided.",
    408   "red_flags": [
    409     {
    410       "flag": "HITL simulated, not real",
    411       "detail": "The Human-In-The-Loop pipeline, which is central to the framework's design and differentiation, is evaluated entirely with simulated human feedback using dataset ground truth labels. No actual human annotators participated, making it unknown how real human error rates would affect performance."
    412     },
    413     {
    414       "flag": "No statistical significance testing",
    415       "detail": "All comparative results across multiple datasets and baselines are reported as point estimates with no confidence intervals, error bars, or significance tests, making it impossible to assess whether performance differences are statistically meaningful."
    416     },
    417     {
    418       "flag": "Code and data not released at submission",
    419       "detail": "Despite claiming 'open-source' in the abstract, the paper body uses future tense ('will be publicly available' / 'will open-source') for both LeakSealer code and the PII dataset, indicating neither was available at time of submission."
    420     },
    421     {
    422       "flag": "No ablation study",
    423       "detail": "The framework combines an embedding model, PCA, UMAP, HDBSCAN clustering, exemplar selection, and four candidate classifiers, but no ablation isolates the contribution of any individual component to the performance gains."
    424     },
    425     {
    426       "flag": "Classifier hyperparameters unreported",
    427       "detail": "The dynamic classifier is selected via nested cross-validation from SVM, RF, XGBoost, and k-NN candidates, but the winning model type and its hyperparameters are not disclosed per dataset, preventing reproduction."
    428     },
    429     {
    430       "flag": "GPT-4o version unpinned",
    431       "detail": "GPT-4o is cited as 'GPT-4o, 2025. Large language model.' without a specific API version, snapshot date, or model ID, making exact reproduction impossible for this baseline."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Llama Guard: LLM-Based Input-Output Safeguard for Human-AI Conversations",
    437       "relevance": "Primary baseline for comparison; LeakSealer is evaluated against Llama Guard 3 8B on all three datasets across both static and dynamic settings."
    438     },
    439     {
    440       "title": "ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World User-AI Conversation",
    441       "relevance": "Primary evaluation benchmark for jailbreak and toxic content detection experiments; 10,166 annotated real-world user-AI interactions."
    442     },
    443     {
    444       "title": "The Good and the Bad: Exploring Privacy Issues in Retrieval-Augmented Generation (RAG)",
    445       "relevance": "Directly motivates the PII leakage threat model by demonstrating RAG systems can leak sensitive documents; key prior work this paper builds upon."
    446     },
    447     {
    448       "title": "ProPILE: Probing Privacy Leakage in Large Language Models",
    449       "relevance": "Related work on PII leakage from LLM training data, contrasted with this paper's focus on leakage from RAG retrieved context."
    450     },
    451     {
    452       "title": "Whispers in the Machine: Confidentiality in LLM-Integrated Systems",
    453       "relevance": "Related work framing PII leakage as a secret key retrieval problem; compared as having a narrower experimental scope than this paper's approach."
    454     },
    455     {
    456       "title": "PARDEN, Can You Repeat That? Defending against Jailbreaks via Repetition",
    457       "relevance": "LLM-based defense baseline that LeakSealer claims to improve upon in computational efficiency by avoiding repeated LLM inference."
    458     },
    459     {
    460       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    461       "relevance": "Foundation for the LLM-as-a-Judge baseline approach compared in the evaluation pipeline."
    462     },
    463     {
    464       "title": "HDBSCAN: Hierarchical Density Based Clustering",
    465       "relevance": "Core algorithmic component of LeakSealer's clustering step, used to identify exemplars and outliers from embedding space."
    466     },
    467     {
    468       "title": "A Holistic Approach to Undesired Content Detection in the Real World",
    469       "relevance": "Source of the OpenAI Content Moderation Dataset, one of the two jailbreak evaluation benchmarks used."
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 3,
    475       "justification": "Directly addresses PII leakage in RAG systems and jailbreaking—active real-world security concerns for any LLM deployment, with a lightweight and model-agnostic approach that practitioners could adopt without GPU-intensive LLM inference."
    476     },
    477     "surprise_contrarian": {
    478       "score": 2,
    479       "justification": "The finding that lightweight traditional ML classifiers (SVM, k-NN) on clustered embeddings outperform LLM-based guards (Llama Guard, GPT-4o) on PII detection tasks is moderately counterintuitive."
    480     },
    481     "fear_safety": {
    482       "score": 2,
    483       "justification": "The paper demonstrates concrete PII leakage risks from RAG systems with quantified detection gaps (Llama Guard recall 0.31 on PII), raising legitimate concerns about deployed production LLM systems."
    484     },
    485     "drama_conflict": {
    486       "score": 1,
    487       "justification": "Results showing LeakSealer beating GPT-4o and Llama Guard on PII tasks offer some competitive angle, but no major controversy or conflict is present."
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "Code and dataset are promised as future releases but not yet available at submission; once released the framework could be demonstrated, but cannot be tried now."
    492     },
    493     "brand_recognition": {
    494       "score": 1,
    495       "justification": "Authors are from Politecnico di Milano (reputable Italian institution) and ML cube (small industry partner) with no major AI lab involvement."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "44052041",
    502         "title": "Discord Unveiled: A Comprehensive Dataset of Public Communication (2015-2024)",
    503         "points": 152,
    504         "comments": 179,
    505         "url": "https://news.ycombinator.com/item?id=44052041",
    506         "created_at": "2025-05-21T14:45:38Z"
    507       },
    508       {
    509         "hn_id": "24051456",
    510         "title": "UT Dallas Computer Science professor claims to have proven RP = NP",
    511         "points": 88,
    512         "comments": 13,
    513         "url": "https://news.ycombinator.com/item?id=24051456",
    514         "created_at": "2020-08-04T16:20:33Z"
    515       },
    516       {
    517         "hn_id": "42602347",
    518         "title": "Did we miss P In CAP? Partial Progress Conjecture under Asynchrony",
    519         "points": 42,
    520         "comments": 4,
    521         "url": "https://news.ycombinator.com/item?id=42602347",
    522         "created_at": "2025-01-05T15:23:00Z"
    523       },
    524       {
    525         "hn_id": "44176172",
    526         "title": "What do software developers need to know to succeed in an age of AI?",
    527         "points": 18,
    528         "comments": 2,
    529         "url": "https://news.ycombinator.com/item?id=44176172",
    530         "created_at": "2025-06-04T00:30:21Z"
    531       },
    532       {
    533         "hn_id": "44805436",
    534         "title": "Quantum machine learning via vector embeddings",
    535         "points": 11,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=44805436",
    538         "created_at": "2025-08-05T22:46:47Z"
    539       },
    540       {
    541         "hn_id": "43382159",
    542         "title": "Do Emotions Affect Argument Convincingness?",
    543         "points": 4,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=43382159",
    546         "created_at": "2025-03-16T20:48:09Z"
    547       },
    548       {
    549         "hn_id": "44777459",
    550         "title": "Hypertokens: Holographic Associative Memory in Tokenized LLMs",
    551         "points": 3,
    552         "comments": 8,
    553         "url": "https://news.ycombinator.com/item?id=44777459",
    554         "created_at": "2025-08-03T16:00:47Z"
    555       },
    556       {
    557         "hn_id": "41215568",
    558         "title": "Leveraging LLM Reasoning Enhances Personalized Recommender Systems",
    559         "points": 3,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=41215568",
    562         "created_at": "2024-08-11T11:56:10Z"
    563       },
    564       {
    565         "hn_id": "43919128",
    566         "title": "Quantifying the Fermi paradox via passive SETI",
    567         "points": 2,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=43919128",
    570         "created_at": "2025-05-07T18:32:13Z"
    571       },
    572       {
    573         "hn_id": "41196057",
    574         "title": "Deceptive AI is most convincing",
    575         "points": 2,
    576         "comments": 0,
    577         "url": "https://news.ycombinator.com/item?id=41196057",
    578         "created_at": "2024-08-08T20:45:23Z"
    579       }
    580     ],
    581     "top_points": 152,
    582     "total_points": 325,
    583     "total_comments": 206
    584   }
    585 }

Impressum · Datenschutz