scan.json (24835B)
1 { 2 "paper": { 3 "title": "GSPR: Aligning LLM Safeguards as Generalizable Safety Policy Reasoners", 4 "authors": [ 5 "Haoran Li", 6 "Yulin Chen", 7 "Jingru Zeng", 8 "Hao Peng", 9 "Huihao Jing", 10 "Wenbin Hu", 11 "Xi Yang", 12 "Ziqian Zeng", 13 "Sirui Han", 14 "Yangqiu Song" 15 ], 16 "year": 2025, 17 "venue": "arXiv preprint", 18 "arxiv_id": "2509.24418", 19 "doi": "10.48550/arXiv.2509.24418" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper states 'Our reproducible data, code, and model weights will be open-sourced' (Section 1) — future tense, no URL provided." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "Training data is composed of publicly available benchmarks (Aegis, SafeRLHF, BeaverTails, WildGuard, OR-Bench, GUARDSET-X) with specific splits documented in Table 5." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, or detailed environment specification provided. Only mentions '8 NVIDIA H800 graphics cards' and vLLM/VERL packages." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions or README provided. Training details are in Appendix C.2 but no runnable scripts or commands." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Tables 2 and 3 are point estimates (e.g., '85.68' S-Acc) with no confidence intervals or error bars." 
51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims GSPR 'outperforms' and 'significantly improves' baselines based solely on comparing accuracy numbers without any statistical significance tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports percentage improvements with baseline context, e.g., 'more than 45% accuracy improvement in fine-grained category prediction' and 'improves the overall S-Acc from 84% to 86%' (Section 4.2)." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification for test set sizes. Sample sizes are stated in Tables 5 and 6 but never justified." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "Inference uses temperature=0.0 for 'a single run to ensure reproducibility' (Section 4.1). No variance across runs reported." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Extensive baselines including closed-source APIs (o3-mini, Gemini-2.5-Flash), open-source guardrails (ShieldGemma-9B, LlamaGuard3-8B, GuardReasoner-8B), base models (Qwen2.5-7B, Qwen3-8B), and RL-aligned RSafe." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include recent models: o3-mini (2025), Gemini-2.5-Flash (2025), GuardReasoner (2025), RSafe (2025), Qwen3-8B (2025)." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 4.3 presents ablations: GSPR (safety only) vs GSPR w/o Cold-start vs GSPR w/ Cold-start, isolating effects of prompt template, category reward, and cold-start SFT." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Three metrics reported: Safety Accuracy (S-Acc), Safety F1 (S-F1), and Category Accuracy (C-Acc)." 
93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation of GSPR's outputs. Case studies in Appendix D are qualitative illustrations, not systematic human evaluation." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Testing uses held-out test splits of training benchmarks (Table 6) plus 4 completely unseen out-of-domain datasets (OpenAI Moderation, HEx-PHI, T2T, Do-Not-Answer)." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Tables 2 and 3 break down results per dataset and per task (prompt safety vs response safety), with S-Acc, S-F1, and C-Acc for each." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 4.4 and Appendix D discuss failure cases including RSafe's conflicting reasoning traces, language mixing issues, and repetition problems." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that 'GSPR w/o Cold-start' underperforms on some metrics, and that GSPR (safety only) on Qwen3 suffers >31% language mixing and >3% repetition (Table 4)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims of improved safety/category prediction and lower inference costs are supported by Tables 2, 3, and 4." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims about components (cold-start, category reward, prompt template) are supported by controlled ablation studies in Section 4.3 that isolate each component." 
130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims 'Generalizable Safety Policy Reasoners' broadly, but out-of-domain evaluation covers only 4 English-language text-based safety benchmarks. No discussion of limits to non-English, multimodal, or domain-specific policies." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No discussion of alternative explanations for results. For example, performance gains could partly stem from increased training data diversity rather than the GRPO mechanism, but this is not explored." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper measures accuracy on benchmark labels (S-Acc, C-Acc) and frames this as 'content moderation performance' and 'safety reasoning capabilities' without discussing the gap between benchmark accuracy and real-world content moderation effectiveness." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Models are named without version snapshots: 'Qwen2.5-7B-Instruct', 'Qwen3-8B', 'Gemini-2.5-Flash', 'o3-mini'. No API dates or checkpoint hashes provided." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt templates for content moderation (Table 8), cold-start annotation (Table 7), and the GSPR training prompt (Figure 1) are provided with actual text." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Appendix C.2 reports: batch size 128, learning rate 1e-7, 5 rollouts, temperature 0.7 for rollout, top_p 0.8, repetition_penalty 1.2, α1=0.55, α2=0.45, max_response_length=1024." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. 
GSPR is a standard fine-tuned LLM guardrail, not an agent with tools or multi-step reasoning loops." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 3.1 describes the prompt formatting pipeline, sample ratio s for taxonomy sampling, the 'others' category addition. Appendix B.1 details each dataset's collection. Cold-start filtering via regex is documented in Section 3.1 and Appendix C.4." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "No dedicated limitations section. The conclusion mentions future work ('integrate more existing safety benchmarks') but no substantive limitations discussion." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No threats to validity discussed anywhere in the paper." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit scope boundaries. The paper does not state what it did NOT test (e.g., non-English content, multimodal inputs, adversarial jailbreak robustness beyond benchmark items)." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "Training and test data are from publicly available benchmarks (Aegis, SafeRLHF, BeaverTails, WildGuard, OR-Bench, GUARDSET-X, OpenAI Moderation, HEx-PHI, T2T, Do-Not-Answer). Specific splits and sample counts are documented in Tables 5 and 6." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 3.1 and Appendix B describe training data sources, sampling procedures (3,000 safe + 3,000 unsafe per split), and cold-start sample creation (80 per taxonomy, regex filtering to 1,383)." 
201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. All data comes from standard public benchmarks." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline is documented: benchmark collection → sample balancing (Table 5) → prompt formatting with taxonomy sampling → cold-start distillation from Gemini-2.5-Flash → regex filtering → SFT → GRPO alignment." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding source or acknowledgments section found in the paper." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are listed: HKUST, National University of Singapore, South China University of Technology, Beihang University." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "No funding information disclosed, so independence cannot be assessed." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "The paper does not state the training data cutoff dates for Qwen2.5-7B-Instruct or Qwen3-8B base models. These models may have seen safety benchmark data during pre-training." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether the base models' pre-training data includes any of the safety benchmark test sets used for evaluation." 
245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "Several benchmarks (BeaverTails, WildGuard, SafeRLHF) were published before Qwen3's training cutoff. No contamination analysis performed." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Table 4 reports average word counts per response as a proxy for inference cost. The paper explicitly frames this as a cost analysis in Section 4.4, Finding 5." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Section 4.1: 'All experiments are conducted on a node with 8 NVIDIA H800 graphics cards and take approximately 40 days of GPU hours.'" 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No seed sensitivity analysis. 
Inference uses temperature=0.0 for deterministic output, but training with GRPO involves stochasticity and no seed analysis is reported." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "Results appear to be from a single training run per configuration. Number of runs is not stated." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "Hyperparameters are reported (Appendix C.2) but no search budget or description of how they were selected." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "The final hyperparameter configuration (α1=0.55, α2=0.45, lr=1e-7, etc.) is presented without justification for how it was selected." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "Many comparisons across models, datasets, and metrics without any statistical tests, let alone multiple comparison correction." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "Authors implement their own version of RSafe baseline ('we follow the official implementation') and compare against it without acknowledging self-implementation bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "GSPR requires cold-start SFT + GRPO training (~40 GPU days on H800s) while baselines like ShieldGemma and LlamaGuard have different compute costs. No compute-matched comparison." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether safety benchmark accuracy actually measures real-world content moderation effectiveness. The paper treats benchmark performance as equivalent to safety capability." 
341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding involved. GSPR is a direct LLM inference system, not a scaffolded agent." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether Qwen2.5/Qwen3 pre-training data includes safety benchmark data published before their training cutoffs." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of feature leakage. The safety taxonomy is provided in the prompt, which could leak category information for classification." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of potential overlap between training benchmarks (e.g., BeaverTails and SafeRLHF share similar data sources from Alpaca-generated outputs)." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method applied." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "GSPR achieves state-of-the-art safety prediction performance, improving overall S-Acc from 84% to 86% with cold-start strategy", 374 "evidence": "Table 2 shows GSPR w/ Cold-start achieves 86.36% overall S-Acc on Qwen3-8B, vs 84.00% for RSafe and 83.87% for base Qwen3-8B", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "GSPR yields more than 45% overall C-Acc improvement over RSafe for fine-grained category prediction", 379 "evidence": "Table 2: GSPR w/ Cold-start (Qwen2.5) achieves 78.32% C-Acc vs RSafe's 30.17% — a 48pp gain", 380 "supported": "strong" 381 }, 382 { 383 "claim": "GSPR demonstrates robust generalization to out-of-domain safety taxonomies", 384 "evidence": "Table 3 shows GSPR w/ Cold-start achieves 93.11% overall S-Acc and 79.85% C-Acc on 4 unseen benchmarks (Qwen3-8B)", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "GSPR generates the most efficient safety reasoning traces with the least inference token cost", 389 "evidence": "Table 4: GSPR w/ Cold-start averages 34.10 words (Qwen2.5) and 77.73 words (Qwen3) vs 140+ for other reasoning models", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Cold-start strategy brings more than 20% C-Acc gains under Qwen2.5-7B-Instruct", 394 "evidence": "Table 2: GSPR w/ Cold-start 78.32% vs GSPR w/o Cold-start 54.06% overall C-Acc = 24pp gain", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Format reward effectively eliminates language switching and repetitions", 399 "evidence": "Table 4: GSPR w/ Cold-start reduces Mix% to 0.04-0.06% vs RSafe's 0.68-25.23% on Qwen3", 400 "supported": "strong" 401 } 402 ], 403 "methodology_tags": ["benchmark-eval"], 404 "key_findings": "GSPR proposes a flexible training pipeline that incorporates variable safety taxonomies into guardrail prompts and uses GRPO with cold-start SFT to train a generalizable safety policy reasoner. 
The approach achieves state-of-the-art performance on both binary safety prediction (~86% S-Acc) and fine-grained category prediction (~78% C-Acc) across 8 safety benchmarks, with 45+pp category accuracy improvement over RSafe. GSPR also generates the most concise reasoning traces (34-78 avg words) while effectively eliminating language mixing and repetition issues that plague other RL-aligned guardrails.", 405 "red_flags": [ 406 { 407 "flag": "No statistical testing", 408 "detail": "All claims of superiority are based on point estimate comparisons without significance tests, confidence intervals, or variance across runs. Single training run with temperature=0.0 inference provides no uncertainty quantification." 409 }, 410 { 411 "flag": "No limitations discussion", 412 "detail": "The paper has no limitations section. Scope boundaries, failure modes in real-world deployment, adversarial robustness, and generalization limits are not discussed." 413 }, 414 { 415 "flag": "Self-implemented baseline", 416 "detail": "RSafe baseline is re-implemented by the authors ('we follow the official implementation') since no open-source weights are available. This introduces self-comparison bias — the authors' implementation of RSafe may not match the original's performance." 417 }, 418 { 419 "flag": "Unfair baseline comparison for C-Acc", 420 "detail": "LlamaGuard3 and GuardReasoner score ~0% C-Acc because they 'fail to follow our instructions' for fine-grained policy output. These models were designed for fixed taxonomies, not flexible ones. The comparison conflates instruction-following ability with safety reasoning capability." 421 }, 422 { 423 "flag": "No contamination analysis", 424 "detail": "Safety benchmarks (BeaverTails 2023, WildGuard 2024, SafeRLHF 2024) were published before Qwen3's training cutoff. Base model performance could partly reflect memorization rather than safety reasoning." 
425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations", 430 "authors": ["Hakan Inan"], 431 "year": 2023, 432 "arxiv_id": "2312.06674", 433 "relevance": "Major LLM safety guardrail baseline, one of the systems GSPR compares against." 434 }, 435 { 436 "title": "GuardReasoner: Towards Reasoning-based LLM Safeguards", 437 "authors": ["Yue Liu"], 438 "year": 2025, 439 "arxiv_id": "2501.18492", 440 "relevance": "Safety guardrail with reasoning capabilities, key baseline for comparing reasoning-based content moderation." 441 }, 442 { 443 "title": "RSafe: Incentivizing Proactive Reasoning to Build Robust and Adaptive LLM Safeguards", 444 "authors": ["Jingnan Zheng"], 445 "year": 2025, 446 "arxiv_id": "2506.07736", 447 "relevance": "RL-aligned safety guardrail using GRPO; primary baseline that GSPR extends and improves upon." 448 }, 449 { 450 "title": "WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs", 451 "authors": ["Seungju Han"], 452 "year": 2024, 453 "relevance": "Open-source safety moderation tool and benchmark used in GSPR's training and evaluation." 454 }, 455 { 456 "title": "BeaverTails: Towards Improved Safety Alignment of LLM via a Human-Preference Dataset", 457 "authors": ["Jiaming Ji"], 458 "year": 2023, 459 "relevance": "Safety benchmark with 14 harm categories used for GSPR training and evaluation." 460 }, 461 { 462 "title": "PKU-SafeRLHF: Towards Multi-Level Safety Alignment for LLMs with Human Preference", 463 "authors": ["Jiaming Ji"], 464 "year": 2024, 465 "arxiv_id": "2406.15513", 466 "relevance": "Large-scale safety benchmark with 19 harm categories used for GSPR training and evaluation." 
467 }, 468 { 469 "title": "ShieldGemma: Generative AI Content Moderation Based on Gemma", 470 "authors": ["Wenjun Zeng"], 471 "year": 2024, 472 "arxiv_id": "2407.21772", 473 "relevance": "Content moderation model built on Gemma2, evaluated as a baseline guardrail." 474 }, 475 { 476 "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", 477 "authors": ["Zhihong Shao"], 478 "year": 2024, 479 "arxiv_id": "2402.03300", 480 "relevance": "Introduces GRPO algorithm that GSPR adapts for safety reasoning alignment." 481 }, 482 { 483 "title": "Deep Reinforcement Learning from Human Preferences", 484 "authors": ["Paul F Christiano"], 485 "year": 2017, 486 "relevance": "Foundational RLHF work for safety alignment that GSPR builds upon." 487 }, 488 { 489 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 490 "authors": ["Eric Wallace"], 491 "year": 2024, 492 "arxiv_id": "2404.13208", 493 "relevance": "Safety training approach for LLMs using instruction hierarchy, relevant to LLM safety defense mechanisms." 494 }, 495 { 496 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 497 "authors": ["Evan Hubinger"], 498 "year": 2024, 499 "arxiv_id": "2401.05566", 500 "relevance": "Demonstrates persistent backdoor attacks in LLMs that survive safety training, relevant to LLM safety threats." 501 }, 502 { 503 "title": "Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To!", 504 "authors": ["Xiangyu Qi"], 505 "year": 2024, 506 "relevance": "Shows fine-tuning can compromise LLM safety alignment, relevant to safety guardrail evaluation." 507 } 508 ] 509 }