ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30339B)


      1 {
      2   "paper": {
      3     "title": "AEGIS2.0: A Diverse AI Safety Dataset and Risks Taxonomy for Alignment of LLM Guardrails",
      4     "authors": [
      5       "Shaona Ghosh",
      6       "Prasoon Varshney",
      7       "Makesh Narsimhan Sreedhar",
      8       "Aishwarya Padmakumar",
      9       "Traian Rebedea",
     10       "Jibin Rajan Varghese",
     11       "Christopher Parisien"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv preprint",
     15     "arxiv_id": "2501.09004"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The paper states in the Ethics section: 'We will soon release our dataset and models under a commercial-permissive license.' This is a promise of future release, not an actual release. The paper references the llama-recipes repository for training but no code for the AEGIS2.0 pipeline itself is released."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The abstract states 'We plan to open-source AEGIS2.0 data and models to the research community,' and the Ethics section confirms access 'will be carefully monitored.' At the time of the paper, the dataset was not yet released — this is a stated intention, not an actual release."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Appendix A.4.4 specifies the training environment: 8 x A100 GPUs with PyTorch FSDP, the llama-recipes repository, LoRA r=16, alpha=32, batch size=4, learning rate 1e-4 with CosineAnnealingWarmRestarts scheduler. This is sufficient to recreate the training environment."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "While hyperparameters and hardware are described, no step-by-step reproduction instructions are provided. The dataset and code are not yet released, making reproduction impossible regardless of instructions. No README or script-level guidance is given."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Table 7 reports standard deviations in parentheses for all results across the ablation study (e.g., 'catdesc prompt: 0.761(0.005)'). Table 3 mentions results are averaged over three runs, and Table 7 explicitly confirms standard deviations over 3 random seeds."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No statistical significance tests are used when claiming that LLAMA3.1-AEGISGUARD outperforms baselines like LLAMAGUARD3-8B. Comparisons are made based on F1 score differences with no p-values, t-tests, or other statistical tests."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper reports raw F1 scores for each model and condition but does not compute formal effect sizes (Cohen's d, odds ratios, percentage improvement). The reader can calculate differences from Table 3, but the paper itself does not explicitly report effect sizes or contextualize the magnitude of differences."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The test split of 1,984 samples is described as selected via stratified sampling (Section 4) but no power analysis or justification for this specific sample size is provided. The choice of 3 random seeds for averaging is also not justified."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Table 7 reports standard deviations across 3 random seed trials for all ablation results (e.g., 'catlist+ prompt: 0.759(0.009)'). The paper explicitly states 'Mean harmfulness F1 scores reported over 3 random seeds with standard deviation in parenthesis.'"
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Multiple baselines are compared including LLAMAGUARD3-8B, LLAMAGUARD3-1B, LLAMAGUARD2-8B, OPENAI MOD API, BEAVERDAM, and WILDGUARD in Tables 3 and 4."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "All baselines are from 2023-2024, contemporary with the paper (2025). LLAMAGUARD3-8B (Llama 3.1 family, 2024), WILDGUARD (2024), and ShieldGemma (2024) are among the most recent models in this space."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "A thorough ablation study in Table 7 (Appendix A.5) isolates the effect of prompt formats (catdesc/catlist/catlist+), refusal data, LLM jury labels vs. conversation labels, and the source of response labels. Section 5 and Appendix A.5 describe these ablations."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper reports harmfulness F1 scores across multiple benchmarks (WILDGUARDTEST, XSTest, OpenAI Moderation Dataset) and also category prediction accuracy (94% reported in Section 5.1). Both binary safety and multi-class category prediction are evaluated."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The paper evaluates entirely via automated benchmarks (F1 against ground truth labels on WGTEST, XSTEST, OAI Mod). Human evaluation of the model's safety classifications on novel inputs is not performed. For a content moderation model, human evaluation of outputs is relevant — the model's safety judgments could be assessed by human reviewers."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 4 explicitly describes: 'we provide a train-test split of the dataset, by selecting 1,984 samples for testing via stratified sampling.' Models are evaluated on the held-out AEGIS2.0 test set (Table 4) and on external out-of-domain benchmarks (Table 3)."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5.1 and Appendix A.6 provide per-category breakdowns including heatmaps (Figure 1) showing ground truth vs. predictions across all OpenAI taxonomy categories, and Figure 4 showing WILDGUARDTEST category distributions."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No qualitative examples of model misclassifications are shown. No error analysis of specific cases where AEGISGUARD fails. The Limitations section discusses dataset-level weaknesses (single response model, category imbalance) but not specific failure patterns of the trained model. The heatmap in Figure 1 shows some misclassification patterns at the category level but no qualitative failure cases."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The ablation study (Table 7) shows that the catdesc prompt style 'is not consistently beneficial over the catlist and catlist+ style prompts' (Section A.5.1), which is an unexpected and negative result explicitly acknowledged. The paper also reports that adding refusal data does not consistently improve all metrics."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims that lightweight models trained on AEGIS2.0 achieve performance competitive with WILDGUARD. Tables 3 and 4 support this: LLAMA3.1-AEGISGUARD achieves 0.808 vs. WILDGUARD's 0.828 unweighted average. The abstract appropriately uses 'competitive with' rather than 'surpasses.'"
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper makes causal claims through ablation studies (e.g., 'adding fine-grained categories improves performance') which are justified by controlled single-variable ablations in Table 7. The ablation design manipulates one variable at a time (prompt format, refusal data, jury labels), allowing causal attribution."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper is appropriately bounded: it tests LLAMA3.1-8B-INSTRUCT as the base model and evaluates on English-language safety data. The Limitations section explicitly notes the English-only focus and single response model (Mistral-7B-v0.1). The paper does not claim universal coverage."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss substantive alternative explanations for its main results. The competitive performance with WILDGUARD could be partly explained by base model knowledge (zero-shot baseline is already 0.738), benchmark contamination, or overlap between AEGIS2.0 training sources and test benchmarks. The catdesc discussion is about an internal design choice, not an alternative explanation for the main findings. The Limitations section mentions LLM jury bias but does not connect it to explaining experimental outcomes."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Model versions are specified throughout: LLAMA3.1-8B-INSTRUCT, LLAMAGUARD3-8B, LLAMAGUARD3-1B, LLAMAGUARD2-8B, Mixtral-8x22B-v0.1, Mistral-NeMo, Gemma-2-27B-it, Mistral-7B-v0.1. These are specific named versions, not just generic model families."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Appendix A.12 provides the full system prompt templates (catlist, catlist+, catdesc styles) with complete text including taxonomy definitions and output format. The {prompt} and {response} placeholders represent input data to classify (user messages and LLM responses), not prompt engineering decisions. The actual prompt engineering — system instructions, taxonomy, output format — is fully specified. The annotator prompt templates for the LLM jury are also shown in Figure 8."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix A.4.4 reports: AnyPrecisionAdamW optimizer, initial learning rate 1e-4, CosineAnnealingWarmRestarts with T_0=0.2*training_length and T_mult=2, LoRA r=16 alpha=32, 3-4 epochs, batch size 4, 8 x A100 GPUs with PyTorch FSDP."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used in this work. The trained AEGISGUARD model is a single-pass classifier, not an agent with tools, retry logic, or memory. The jury-of-LLMs is a data labeling pipeline (ensemble voting), not agentic scaffolding."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4 documents the data pipeline: prompt sources (Anthropic/hh-rlhf, DAN, AART, Do-Not-Answer), response generation with Mistral-7B-v0.1, human annotation at dialogue level, jury-of-LLMs for response labels, and refusal data generation. Section 4.1 describes the annotation QA process with 10-15% auditing per chunk."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8 is explicitly titled 'Limitations' and provides substantive discussion covering: single response model limitation, category imbalance, LLM jury bias, English-only coverage, and annotator bias. This is a dedicated section with multi-paragraph discussion."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 8 discusses specific threats: 'use of the LLM-Jury annotations introduces potential biases inherent to the models themselves' with specific reference to gender, race, and cultural biases; underrepresentation of the Sexual (minor) and Threat categories, as shown by the distribution in Figure 7; and the possibility that the 12 US-based annotators do not represent global cultural norms."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 8 explicitly states 'AEGIS2.0 also lacks robust multilingual support' and 'the dataset primarily focuses on English-language data.' Section A.1 states 'We do not claim that our taxonomy and safety policy are comprehensive, and that the model trained with this mitigates all potential risks.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The dataset is not yet released at the time of the paper. The Ethics section states 'We will soon release our dataset and models' with monitored access. No current download link or archive is provided for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4 describes data collection thoroughly: prompts sourced from Anthropic/hh-rlhf, DAN (Shen et al., 2024), AART (Radharapu et al., 2023), and Do-Not-Answer (Wang et al., 2023). Responses were generated using Mistral-7B-v0.1, with Gemma-2-27B used to generate refusals. Section 4.1 details the annotation team of 12 annotators."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The Ethics section describes annotator selection: 12 annotators in the US from various ethnic/religious backgrounds, 4 from engineering backgrounds and 8 from creative writing. Annotators 'were asked to join on a volunteer basis based on their skill level, availability, and willingness to expose themselves to potentially toxic content.'"
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The full pipeline is documented: prompt selection from source datasets → Mistral-7B-v0.1 response generation → dialogue-level human annotation (3 annotations per sample) → QA auditing (10-15% per chunk) → jury-of-LLMs for response-level labels → majority voting → ternary-to-binary label conversion. Final dataset: 34,248 samples including 16,880 standalone prompts and 17,368 prompt-response pairs."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding disclosure, acknowledgments section, or grants are mentioned in the paper. All authors are affiliated with NVIDIA, but no explicit funding statement is provided."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All seven authors list NVIDIA as their affiliation on the title page. The paper evaluates LLM safety models, not NVIDIA's own commercial products specifically, though NVIDIA has commercial interest in LLM safety tooling."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "All authors are NVIDIA employees, and NVIDIA has commercial interest in LLM safety products (including NeMo Guardrails, referenced in the paper). The work directly develops tools that NVIDIA may commercialize, creating a non-independent relationship between the organization and the outcome."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper. Given all authors are NVIDIA employees working on a product NVIDIA intends to release commercially, the absence of any financial interests disclosure is a gap."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper evaluates models including LLAMA3.1-8B-INSTRUCT and other LLMs on safety benchmarks, but does not state the training data cutoff for any of these models. The evaluation benchmarks (WILDGUARDTEST, XSTest, OpenAI Moderation Dataset) may have been in the training data of base models."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No analysis of potential train/test overlap is provided. LLAMA3.1-8B-INSTRUCT is used as a base model, and benchmarks like XSTest (2023) and the OpenAI Moderation Dataset (2023) may have been in the pre-training corpus of Llama 3.1, but this is not discussed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper does not address whether XSTest or the OpenAI Moderation Dataset were available before the training cutoff of LLAMA3.1-8B-INSTRUCT. The base model's training data composition is not discussed in relation to the evaluation benchmarks."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The annotators are professional data labelers performing a commercial annotation task, not research participants in a study. The paper studies the resulting dataset and models, not annotator behavior. Pre-registration applies to studies where hypotheses concern human behavior."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "The annotators are professional workers performing data labeling, not research subjects. The ethical safeguards described (Adult Content Acknowledgement, wellbeing checks) are occupational health measures, not IRB-governed research protocols. IRB applies to research involving human subjects."
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "The Ethics section reports: 12 annotators, all US-based, various ethnic and religious backgrounds, 4 from engineering backgrounds and 8 from creative writing/linguistics. Gender distribution is not explicitly reported, but age and social status diversity are mentioned."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "The Ethics section states annotators 'were asked to join on a volunteer basis based on their skill level, availability, and willingness to expose themselves to potentially toxic content.' This describes inclusion criteria. All must have 'been extensively trained in working with Large Language Models.'"
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "The human study is an annotation task, not an experimental study with condition assignment. Randomization of condition assignment is not applicable as all annotators perform the same annotation task."
    266       },
    267       "blinding_described": {
    268         "applies": true,
    269         "answer": true,
    270         "justification": "The Ethics section states: 'This tooling technology allows for large sets of data to be analyzed by individual annotators without seeing the work of their peers. This is essential in preventing bias between annotators.' Each annotator is blinded to others' annotations, with Label Studio used to enforce this."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper states they 'averaged twelve annotators at any given time' over six months but does not report how many annotators started, whether any dropped out, or attrition rates. The 'at any given time' phrasing implies there may have been turnover that is not explicitly reported."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost (API cost, tokens consumed, latency) is reported for either the jury-of-LLMs labeling system or the trained LLAMA3.1-AEGISGUARD model. Training time is mentioned (15 minutes per epoch on 8 x A100s) but inference cost is absent."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Appendix A.4.4 states: 'we used 8 x A100 GPUs with PyTorch FSDP enabled, with a batch size of 4 and packing enabled. The training time is about 15 minutes per epoch on this setup.' This gives a compute budget estimate for reproduction."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Parameter-efficient fine-tuning on AEGIS2.0 using LLAMA3.1-8B-INSTRUCT surpasses LLAMAGUARD3-8B and performs at par with the state-of-the-art WILDGUARD model.",
    294       "evidence": "Table 3 shows LLAMA3.1-AEGISGUARD achieves 0.808 unweighted average F1 vs. LLAMAGUARD3-8B (0.764) and WILDGUARD (0.828). Mean over 3 random seeds.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Including fine-grained risk categories in the prompt template improves binary safe/unsafe classification performance compared to core categories alone.",
    299       "evidence": "Table 7 (Appendix A.5.1): catlist+ (0.803) outperforms catlist (0.787) on unweighted average F1. The paper attributes this to fine-grained risks like phishing and malware that appear in WILDGUARDMIX but not in core categories.",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "Using jury-of-LLM annotations for response labels substantially boosts response classification performance over using conversation-level annotations.",
    304       "evidence": "Table 7: removing LLM Jury labels drops response classification F1 on WGTEST from 0.771 to 0.511 and XSTest from 0.847 to 0.521 — a dramatic degradation.",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "Training on a combined blend of topic-following and safety data improves adaptability to new risk categories not seen during training.",
    309       "evidence": "Table 5: LLAMA3.1-AEGISGUARD + TF improves on all four new categories (Financial 0.748 vs. 0.722, Legal 0.890 vs. 0.875, Medical 0.941 vs. 0.895, NSFW 0.772 vs. 0.699) compared to the base model.",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "AEGIS2.0 is the first content moderation training dataset fully suitable for commercial use.",
    314       "evidence": "Table 1 shows AEGIS2.0 is the only dataset among XSTest, OpenAI Mod, HarmBench, ToxicChat, WILDGUARDMIX, and BeaverTails to have commercial use for training. The reasoning for others' ineligibility is provided in Section 2.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "benchmark-eval",
    320     "qualitative"
    321   ],
    322   "key_findings": "AEGIS2.0 is a 34,248-sample LLM safety dataset with human annotations across a 12-category taxonomy (plus 9 fine-grained subcategories), designed for commercial use. A lightweight model (LLAMA3.1-AEGISGUARD) fine-tuned via PEFT on this dataset achieves F1 scores competitive with WILDGUARD on standard safety benchmarks while using only open-source, commercially licensed models for weak supervision. Adding topic-following training data improves robustness and zero-shot generalization to novel safety categories not seen during training. Ablation studies show that including fine-grained risk categories in the prompt taxonomy and using LLM jury labels for response annotations are both important for strong performance.",
    323   "red_flags": [
    324     {
    325       "flag": "self-evaluation by product developer",
    326       "detail": "All seven authors are NVIDIA employees, and the paper develops safety tools that NVIDIA plans to release under a commercial-permissive license (with monitored access). The NVIDIA affiliation is listed on the title page, but there is no competing interests statement. The evaluation shows NVIDIA's model favorably against competitors, and this conflict of interest is not explicitly declared."
    327     },
    328     {
    329       "flag": "dataset not yet released at publication",
    330       "detail": "The abstract states 'We plan to open-source AEGIS2.0 data and models' and the Ethics section says 'We will soon release our dataset and models.' Independent verification of the dataset quality, annotation accuracy, and category distribution is impossible until the data is actually released."
    331     },
    332     {
    333       "flag": "no significance testing for competitive comparisons",
    334       "detail": "The key claim that LLAMA3.1-AEGISGUARD 'surpasses LLAMAGUARD3-8B' is based on F1 score differences (0.808 vs. 0.764) without statistical significance tests. Given the variance reported in Table 7, some of these differences may not be statistically robust."
    335     },
    336     {
    337       "flag": "benchmark contamination not addressed",
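    338       "detail": "The paper uses LLAMA3.1-8B-INSTRUCT as a base model and evaluates on XSTest (2023) and OpenAI Moderation Dataset (2023). Both benchmarks were released in 2023, before Llama 3.1 (2024), so they may have been included in its pre-training corpus, but the paper neither states the model's training cutoff nor provides a contamination analysis. The zero-shot baseline in Table 7 (0.738 unweighted average) shows the base model already performs strongly on these benchmarks without fine-tuning, which could reflect either general safety knowledge or benchmark contamination; the paper does not distinguish between the two."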
    339     },
    340     {
    341       "flag": "single response model limits dataset diversity",
    342       "detail": "All non-refusal responses in the dataset are generated by a single model (Mistral-7B-v0.1). The paper acknowledges this in Section 8 but does not test how models trained on this data perform when the deployment LLM differs significantly from Mistral-7B-v0.1 in its response characteristics."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    348       "authors": [
    349         "Hakan Inan",
    350         "Kartikeya Upasani",
    351         "Jianfeng Chi"
    352       ],
    353       "year": 2023,
    354       "arxiv_id": "2312.06674",
    355       "relevance": "Foundational content moderation LLM that AEGIS2.0 directly competes with and improves upon."
    356     },
    357     {
    358       "title": "WildGuard: Open one-stop moderation tools for safety risks, jailbreaks, and refusals of LLMs",
    359       "authors": [
    360         "Seungju Han",
    361         "Kavel Rao",
    362         "Allyson Ettinger"
    363       ],
    364       "year": 2024,
    365       "arxiv_id": "2406.18495",
    366       "relevance": "Primary baseline dataset and model that AEGIS2.0 is compared against throughout the paper."
    367     },
    368     {
    369       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    370       "authors": [
    371         "Jiaming Ji",
    372         "Mickel Liu",
    373         "Josef Dai"
    374       ],
    375       "year": 2024,
    376       "relevance": "Comparable human-annotated safety dataset with 14-category taxonomy that AEGIS2.0 improves upon in commercial usability."
    377     },
    378     {
    379       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    380       "authors": [
    381         "Mantas Mazeika",
    382         "Long Phan",
    383         "Xuwang Yin"
    384       ],
    385       "year": 2024,
    386       "arxiv_id": "2402.04249",
    387       "relevance": "Safety evaluation benchmark used for comparison context in the related work section."
    388     },
    389     {
    390       "title": "ToxicChat: Unveiling hidden challenges of toxicity detection in real-world user-AI conversation",
    391       "authors": [
    392         "Zi Lin",
    393         "Zihan Wang",
    394         "Yongqi Tong"
    395       ],
    396       "year": 2023,
    397       "arxiv_id": "2310.17389",
    398       "relevance": "Earlier content moderation training dataset that AEGIS2.0 addresses limitations of regarding commercial licensing."
    399     },
    400     {
    401       "title": "ShieldGemma: Generative AI content moderation based on Gemma",
    402       "authors": [
    403         "Wenjun Zeng",
    404         "Yuchi Liu",
    405         "Ryan Mullins"
    406       ],
    407       "year": 2024,
    408       "arxiv_id": "2407.21772",
    409       "relevance": "Contemporary content moderation model trained on closed dataset, representing the state of the art AEGIS2.0 competes with."
    410     },
    411     {
    412       "title": "XSTest: A test suite for identifying exaggerated safety behaviours in large language models",
    413       "authors": [
    414         "Paul Röttger",
    415         "Hannah Rose Kirk",
    416         "Bertie Vidgen"
    417       ],
    418       "year": 2023,
    419       "arxiv_id": "2308.01263",
    420       "relevance": "Evaluation benchmark used to test for over-refusal behavior in safety models."
    421     },
    422     {
    423       "title": "NeMo Guardrails: A toolkit for controllable and safe LLM applications with programmable rails",
    424       "authors": [
    425         "Traian Rebedea",
    426         "Razvan Dinu",
    427         "Makesh Sreedhar"
    428       ],
    429       "year": 2023,
    430       "arxiv_id": "2310.10501",
    431       "relevance": "Production guardrails framework that AEGIS2.0 is designed to support, motivating the need for categorical predictions."
    432     },
    433     {
    434       "title": "AEGIS: Online adaptive AI content safety moderation with ensemble of LLM experts",
    435       "authors": [
    436         "Shaona Ghosh",
    437         "Prasoon Varshney",
    438         "Erick Galinkin",
    439         "Christopher Parisien"
    440       ],
    441       "year": 2024,
    442       "arxiv_id": "2404.05993",
    443       "relevance": "Direct predecessor paper (AEGIS 1.0) that AEGIS2.0 extends with larger dataset, improved taxonomy, and better models."
    444     },
    445     {
    446       "title": "Introducing v0.5 of the AI safety benchmark from MLCommons",
    447       "authors": [
    448         "Bertie Vidgen",
    449         "Adarsh Agrawal"
    450       ],
    451       "year": 2024,
    452       "arxiv_id": "2404.12241",
    453       "relevance": "Industry safety benchmark that informed the AEGIS2.0 taxonomy design."
    454     },
    455     {
    456       "title": "Do-Not-Answer: A dataset for evaluating safeguards in LLMs",
    457       "authors": [
    458         "Yuxia Wang",
    459         "Haonan Li",
    460         "Xudong Han"
    461       ],
    462       "year": 2023,
    463       "arxiv_id": "2308.13387",
    464       "relevance": "One of the source datasets from which prompts were sampled to build AEGIS2.0."
    465     },
    466     {
    467       "title": "AART: AI-assisted red-teaming with diverse data generation for new LLM-powered applications",
    468       "authors": [
    469         "Bhaktipriya Radharapu",
    470         "Kevin Robinson",
    471         "Lora Aroyo"
    472       ],
    473       "year": 2023,
    474       "arxiv_id": "2311.08592",
    475       "relevance": "Source of adversarial prompts incorporated into the AEGIS2.0 dataset."
    476     }
    477   ]
    478 }

Impressum · Datenschutz