ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24835B)


      1 {
      2   "paper": {
      3     "title": "GSPR: Aligning LLM Safeguards as Generalizable Safety Policy Reasoners",
      4     "authors": [
      5       "Haoran Li",
      6       "Yulin Chen",
      7       "Jingru Zeng",
      8       "Hao Peng",
      9       "Huihao Jing",
     10       "Wenbin Hu",
     11       "Xi Yang",
     12       "Ziqian Zeng",
     13       "Sirui Han",
     14       "Yangqiu Song"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv preprint",
     18     "arxiv_id": "2509.24418",
     19     "doi": "10.48550/arXiv.2509.24418"
     20   },
     21   "scan_version": 2,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states 'Our reproducible data, code, and model weights will be open-sourced' (Section 1) — future tense, no URL provided."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Training data is composed of publicly available benchmarks (Aegis, SafeRLHF, BeaverTails, WildGuard, OR-Bench, GUARDSET-X) with specific splits documented in Table 5."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, or detailed environment specification provided. Only mentions '8 NVIDIA H800 graphics cards' and vLLM/VERL packages."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions or README provided. Training details are in Appendix C.2 but no runnable scripts or commands."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results in Tables 2 and 3 are point estimates (e.g., '85.68' S-Acc) with no confidence intervals or error bars."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims GSPR 'outperforms' and 'significantly improves' baselines based solely on comparing accuracy numbers without any statistical significance tests."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports percentage improvements with baseline context, e.g., 'more than 45% accuracy improvement in fine-grained category prediction' and 'improves the overall S-Acc from 84% to 86%' (Section 4.2)."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification for test set sizes. Sample sizes are stated in Tables 5 and 6 but never justified."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Inference uses temperature=0.0 for 'a single run to ensure reproducibility' (Section 4.1). No variance across runs reported."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Extensive baselines including closed-source APIs (o3-mini, Gemini-2.5-Flash), open-source guardrails (ShieldGemma-9B, LlamaGuard3-8B, GuardReasoner-8B), base models (Qwen2.5-7B, Qwen3-8B), and RL-aligned RSafe."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include recent models: o3-mini (2025), Gemini-2.5-Flash (2025), GuardReasoner (2025), RSafe (2025), Qwen3-8B (2025)."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 4.3 presents ablations: GSPR (safety only) vs GSPR w/o Cold-start vs GSPR w/ Cold-start, isolating effects of prompt template, category reward, and cold-start SFT."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Three metrics reported: Safety Accuracy (S-Acc), Safety F1 (S-F1), and Category Accuracy (C-Acc)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of GSPR's outputs. Case studies in Appendix D are qualitative illustrations, not systematic human evaluation."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Testing uses held-out test splits of training benchmarks (Table 6) plus 4 completely unseen out-of-domain datasets (OpenAI Moderation, HEx-PHI, T2T, Do-Not-Answer)."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Tables 2 and 3 break down results per dataset and per task (prompt safety vs response safety), with S-Acc, S-F1, and C-Acc for each."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 4.4 and Appendix D discuss failure cases including RSafe's conflicting reasoning traces, language mixing issues, and repetition problems."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports that 'GSPR w/o Cold-start' underperforms on some metrics, and that GSPR (safety only) on Qwen3 suffers >31% language mixing and >3% repetition (Table 4)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims of improved safety/category prediction and lower inference costs are supported by Tables 2, 3, and 4."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims about components (cold-start, category reward, prompt template) are supported by controlled ablation studies in Section 4.3 that isolate each component."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Generalizable Safety Policy Reasoners' broadly, but out-of-domain evaluation covers only 4 English-language text-based safety benchmarks. No discussion of limits to non-English, multimodal, or domain-specific policies."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No discussion of alternative explanations for results. For example, performance gains could partly stem from increased training data diversity rather than the GRPO mechanism, but this is not explored."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper measures accuracy on benchmark labels (S-Acc, C-Acc) and frames this as 'content moderation performance' and 'safety reasoning capabilities' without discussing the gap between benchmark accuracy and real-world content moderation effectiveness."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Models are named without version snapshots: 'Qwen2.5-7B-Instruct', 'Qwen3-8B', 'Gemini-2.5-Flash', 'o3-mini'. No API dates or checkpoint hashes provided."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Full prompt templates for content moderation (Table 8), cold-start annotation (Table 7), and the GSPR training prompt (Figure 1) are provided with actual text."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix C.2 reports: batch size 128, learning rate 1e-7, 5 rollouts, temperature 0.7 for rollout, top_p 0.8, repetition_penalty 1.2, α1=0.55, α2=0.45, max_response_length=1024."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. GSPR is a standard fine-tuned LLM guardrail, not an agent with tools or multi-step reasoning loops."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3.1 describes the prompt formatting pipeline, the sample ratio s for taxonomy sampling, and the addition of the 'others' category. Appendix B.1 details each dataset's collection. Cold-start filtering via regex is documented in Section 3.1 and Appendix C.4."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No dedicated limitations section. The conclusion mentions future work ('integrate more existing safety benchmarks') but no substantive limitations discussion."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No threats to validity discussed anywhere in the paper."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No explicit scope boundaries. The paper does not state what it did NOT test (e.g., non-English content, multimodal inputs, adversarial jailbreak robustness beyond benchmark items)."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Training and test data are from publicly available benchmarks (Aegis, SafeRLHF, BeaverTails, WildGuard, OR-Bench, GUARDSET-X, OpenAI Moderation, HEx-PHI, T2T, Do-Not-Answer). Specific splits and sample counts are documented in Tables 5 and 6."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3.1 and Appendix B describe training data sources, sampling procedures (3,000 safe + 3,000 unsafe per split), and cold-start sample creation (80 per taxonomy, regex filtering to 1,383)."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data comes from standard public benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline is documented: benchmark collection → sample balancing (Table 5) → prompt formatting with taxonomy sampling → cold-start distillation from Gemini-2.5-Flash → regex filtering → SFT → GRPO alignment."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source or acknowledgments section found in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are listed: HKUST, National University of Singapore, South China University of Technology, Beihang University."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding information disclosed, so independence cannot be assessed."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper does not state the training data cutoff dates for Qwen2.5-7B-Instruct or Qwen3-8B base models. These models may have seen safety benchmark data during pre-training."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the base models' pre-training data includes any of the safety benchmark test sets used for evaluation."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "Several benchmarks (BeaverTails, WildGuard, SafeRLHF) were published before Qwen3's training cutoff. No contamination analysis performed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 4 reports average word counts per response as a proxy for inference cost. The paper explicitly frames this as a cost analysis in Section 4.4, Finding 5."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 4.1: 'All experiments are conducted on a node with 8 NVIDIA H800 graphics cards and take approximately 40 days of GPU hours.'"
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No seed sensitivity analysis. Inference uses temperature=0.0 for deterministic output, but training with GRPO involves stochasticity and no seed analysis is reported."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Results appear to be from a single training run per configuration. Number of runs is not stated."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Hyperparameters are reported (Appendix C.2) but no search budget or description of how they were selected."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The final hyperparameter configuration (α1=0.55, α2=0.45, lr=1e-7, etc.) is presented without justification for how it was selected."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Many comparisons across models, datasets, and metrics without any statistical tests, let alone multiple comparison correction."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Authors implement their own version of RSafe baseline ('we follow the official implementation') and compare against it without acknowledging self-implementation bias."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "GSPR requires cold-start SFT + GRPO training (~40 GPU days on H800s) while baselines like ShieldGemma and LlamaGuard have different compute costs. No compute-matched comparison."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether safety benchmark accuracy actually measures real-world content moderation effectiveness. The paper treats benchmark performance as equivalent to safety capability."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding involved. GSPR is a direct LLM inference system, not a scaffolded agent."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether Qwen2.5/Qwen3 pre-training data includes safety benchmark data published before their training cutoffs."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of feature leakage. The safety taxonomy is provided in the prompt, which could leak category information for classification."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of potential overlap between training benchmarks (e.g., BeaverTails and SafeRLHF share similar data sources from Alpaca-generated outputs)."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method applied."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GSPR achieves state-of-the-art safety prediction performance, improving overall S-Acc from 84% to 86% with cold-start strategy",
    374       "evidence": "Table 2 shows GSPR w/ Cold-start achieves 86.36% overall S-Acc on Qwen3-8B, vs 84.00% for RSafe and 83.87% for base Qwen3-8B",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "GSPR yields more than 45% overall C-Acc improvement over RSafe for fine-grained category prediction",
    379       "evidence": "Table 2: GSPR w/ Cold-start (Qwen2.5) achieves 78.32% C-Acc vs RSafe's 30.17% — a 48pp gain",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "GSPR demonstrates robust generalization to out-of-domain safety taxonomies",
    384       "evidence": "Table 3 shows GSPR w/ Cold-start achieves 93.11% overall S-Acc and 79.85% C-Acc on 4 unseen benchmarks (Qwen3-8B)",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "GSPR generates the most efficient safety reasoning traces with the least inference token cost",
    389       "evidence": "Table 4: GSPR w/ Cold-start averages 34.10 words (Qwen2.5) and 77.73 words (Qwen3) vs 140+ for other reasoning models",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Cold-start strategy brings more than 20% C-Acc gains under Qwen2.5-7B-Instruct",
    394       "evidence": "Table 2: GSPR w/ Cold-start 78.32% vs GSPR w/o Cold-start 54.06% overall C-Acc = 24pp gain",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Format reward effectively eliminates language switching and repetitions",
    399       "evidence": "Table 4: GSPR w/ Cold-start reduces Mix% to 0.04-0.06% vs RSafe's 0.68-25.23% on Qwen3",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": ["benchmark-eval"],
    404   "key_findings": "GSPR proposes a flexible training pipeline that incorporates variable safety taxonomies into guardrail prompts and uses GRPO with cold-start SFT to train a generalizable safety policy reasoner. The approach achieves state-of-the-art performance on both binary safety prediction (~86% S-Acc) and fine-grained category prediction (~78% C-Acc) across 8 safety benchmarks, with 45+pp category accuracy improvement over RSafe. GSPR also generates the most concise reasoning traces (34-78 avg words) while effectively eliminating language mixing and repetition issues that plague other RL-aligned guardrails.",
    405   "red_flags": [
    406     {
    407       "flag": "No statistical testing",
    408       "detail": "All claims of superiority are based on point estimate comparisons without significance tests, confidence intervals, or variance across runs. Single training run with temperature=0.0 inference provides no uncertainty quantification."
    409     },
    410     {
    411       "flag": "No limitations discussion",
    412       "detail": "The paper has no limitations section. Scope boundaries, failure modes in real-world deployment, adversarial robustness, and generalization limits are not discussed."
    413     },
    414     {
    415       "flag": "Self-implemented baseline",
    416       "detail": "RSafe baseline is re-implemented by the authors ('we follow the official implementation') since no open-source weights are available. This introduces self-comparison bias — the authors' implementation of RSafe may not match the original's performance."
    417     },
    418     {
    419       "flag": "Unfair baseline comparison for C-Acc",
    420       "detail": "LlamaGuard3 and GuardReasoner score ~0% C-Acc because they 'fail to follow our instructions' for fine-grained policy output. These models were designed for fixed taxonomies, not flexible ones. The comparison conflates instruction-following ability with safety reasoning capability."
    421     },
    422     {
    423       "flag": "No contamination analysis",
    424       "detail": "Safety benchmarks (BeaverTails 2023, WildGuard 2024, SafeRLHF 2024) were published before Qwen3's training cutoff. Base model performance could partly reflect memorization rather than safety reasoning."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    430       "authors": ["Hakan Inan"],
    431       "year": 2023,
    432       "arxiv_id": "2312.06674",
    433       "relevance": "Major LLM safety guardrail baseline, one of the systems GSPR compares against."
    434     },
    435     {
    436       "title": "GuardReasoner: Towards Reasoning-based LLM Safeguards",
    437       "authors": ["Yue Liu"],
    438       "year": 2025,
    439       "arxiv_id": "2501.18492",
    440       "relevance": "Safety guardrail with reasoning capabilities, key baseline for comparing reasoning-based content moderation."
    441     },
    442     {
    443       "title": "RSafe: Incentivizing Proactive Reasoning to Build Robust and Adaptive LLM Safeguards",
    444       "authors": ["Jingnan Zheng"],
    445       "year": 2025,
    446       "arxiv_id": "2506.07736",
    447       "relevance": "RL-aligned safety guardrail using GRPO; primary baseline that GSPR extends and improves upon."
    448     },
    449     {
    450       "title": "WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs",
    451       "authors": ["Seungju Han"],
    452       "year": 2024,
    453       "relevance": "Open-source safety moderation tool and benchmark used in GSPR's training and evaluation."
    454     },
    455     {
    456       "title": "BeaverTails: Towards Improved Safety Alignment of LLM via a Human-Preference Dataset",
    457       "authors": ["Jiaming Ji"],
    458       "year": 2023,
    459       "relevance": "Safety benchmark with 14 harm categories used for GSPR training and evaluation."
    460     },
    461     {
    462       "title": "PKU-SafeRLHF: Towards Multi-Level Safety Alignment for LLMs with Human Preference",
    463       "authors": ["Jiaming Ji"],
    464       "year": 2024,
    465       "arxiv_id": "2406.15513",
    466       "relevance": "Large-scale safety benchmark with 19 harm categories used for GSPR training and evaluation."
    467     },
    468     {
    469       "title": "ShieldGemma: Generative AI Content Moderation Based on Gemma",
    470       "authors": ["Wenjun Zeng"],
    471       "year": 2024,
    472       "arxiv_id": "2407.21772",
    473       "relevance": "Content moderation model built on Gemma2, evaluated as a baseline guardrail."
    474     },
    475     {
    476       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    477       "authors": ["Zhihong Shao"],
    478       "year": 2024,
    479       "arxiv_id": "2402.03300",
    480       "relevance": "Introduces GRPO algorithm that GSPR adapts for safety reasoning alignment."
    481     },
    482     {
    483       "title": "Constitutional AI: Harmlessness from AI Feedback",
    484       "authors": ["Paul F Christiano"],
    485       "year": 2017,
    486       "relevance": "Foundational RLHF work for safety alignment that GSPR builds upon."
    487     },
    488     {
    489       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    490       "authors": ["Eric Wallace"],
    491       "year": 2024,
    492       "arxiv_id": "2404.13208",
    493       "relevance": "Safety training approach for LLMs using instruction hierarchy, relevant to LLM safety defense mechanisms."
    494     },
    495     {
    496       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    497       "authors": ["Evan Hubinger"],
    498       "year": 2024,
    499       "arxiv_id": "2401.05566",
    500       "relevance": "Demonstrates persistent backdoor attacks in LLMs that survive safety training, relevant to LLM safety threats."
    501     },
    502     {
    503       "title": "Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To!",
    504       "authors": ["Xiangyu Qi"],
    505       "year": 2024,
    506       "relevance": "Shows fine-tuning can compromise LLM safety alignment, relevant to safety guardrail evaluation."
    507     }
    508   ]
    509 }

Impressum · Datenschutz