scan.json (30249B)
1 { 2 "paper": { 3 "title": "Inference-Only Prompt Projection for Safe Text-to-Image Generation with TV Guarantees", 4 "authors": [ 5 "Minhyuk Lee", 6 "Hyekyung Yoon", 7 "Myungjoo Kang" 8 ], 9 "year": 2026, 10 "venue": "arXiv", 11 "arxiv_id": "2602.00616" 12 }, 13 "scan_version": 2, 14 "active_modules": ["experimental_rigor", "data_leakage"], 15 "methodology_tags": ["benchmark-eval", "theoretical"], 16 "key_findings": "The paper formalizes the Safety-Prompt Alignment Trade-off (SPAT) in total variation, proving that any nontrivial safety gain under a fixed reference model requires distributional deviation. Building on this, it proposes an inference-only prompt projection framework using a two-stage LLM/VLM cascade that achieves 16.7-60.0% relative reductions in inappropriate percentage versus model-level alignment baselines across three Stable Diffusion backbones and four datasets, while preserving benign prompt-image alignment near the unaligned reference on COCO. The tolerance threshold τ provides monotonic control over the safety-utility trade-off.", 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No code repository URL is provided. Appendix F mentions 'we persist the resulting index list alongside our code release' implying future release, but no working link or archive is given." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "All evaluation datasets are publicly available. Appendix F provides specific HuggingFace/Zenodo URLs for CoProV2, I2P, UD, and COCO, and documents the exact subsampling protocol (seed=42) for reproducibility." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "Appendix C.2 states '8 NVIDIA H100 GPUs (80GB each)' but provides no requirements.txt, Dockerfile, library versions, or framework versions. Hardware alone is insufficient to recreate the software environment." 
33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithms and appendices describe the method but not how to run it end-to-end." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Table 1 and all other results tables report only point estimates (e.g., IP=0.04, FID=32.46) with no confidence intervals, error bars, or ± notation." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper claims its method 'achieves the lowest inappropriate percentage' and 'outperforming strong baselines' based solely on comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper reports relative reductions with baseline context: '16.7–60.0% relative reductions in IP versus strong model-level alignment baselines' (abstract/§5.2), and Table 1 provides absolute values for both the proposed method and all baselines." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No justification is given for dataset sizes. The COCO-3K subset is described as 'randomly sampling 3,000 safe captions' (§5.1) with no rationale for why 3,000 is sufficient. CoProV2 fitting subset of 1,000 is also unjustified." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No variance, standard deviation, or spread measures are reported. Appendix D.4 explicitly states 'The reported numbers correspond to a single-seed run' for the A/B ablation, and main results appear to be single-run as well." 
65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Table 1 compares against 11 baselines on SD1.5: SLD, ESD-u, UCE, AlignGuard, VALOR, POSI, PNO, PromptGuard, SAFREE, LatentGuard, and GuardT2I. SD2.1 and SDXL compare against 3 representative baselines." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "Baselines include recent works: AlignGuard (2025), VALOR (2025), PromptGuard (2025), SAFREE (2024), PNO (2024), LatentGuard (2024). The oldest baseline (SLD, 2023) is only 3 years old." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "Appendix D.1 provides hyperparameter sensitivity ablations (log-probability threshold, α_safety, number of candidates, local search iterations). Appendix D.2 studies LLM scaling across model sizes. Appendix D.4 ablates A/B label ordering. Appendix D.5 ablates embedding model choice." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "Three metrics are used: inappropriate percentage (IP) for safety, FID for image quality, and CLIPScore for prompt-image alignment (§5.1, Table 1)." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation is conducted. All safety and quality assessments use automated detectors (Q16, NudeNet, CLIP). Human evaluation would be relevant to validate whether prompt rewrites preserve meaning and whether safety classifications align with human judgment." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "CoProV2 has explicit train/test splits (15,690/8,000 pairs). §5.1 states 'All model selection (including hyperparameter tuning) is performed only on CoProV2.' I2P and UD serve as out-of-distribution test sets." 
97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Table 2 provides category-wise IP scores across 7 safety categories (Shocking, Self-harm, Sexual, Illegal, Hate, Violence, Harassment) under four adversarial attacks." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "§4.4 explicitly discusses failure modes: 'When unsafe content is inseparable from the core intent, meaning preservation may conflict with meeting τ.' The impact statement discusses residual harms from imperfect evaluators and rewriting." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": false, 111 "justification": "Every experiment shows the method performing well. The hyperparameter ablation (Appendix D.1) shows only minor sensitivity variations. No configurations that failed or approaches that were tried and abandoned are reported." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims '16.7–60.0% relative reductions in IP versus strong model-level alignment baselines' which is supported by Table 1 across SD1.5/SD2.1/SDXL. The claim of 'preserving benign prompt–image alignment on COCO near the unaligned reference' is supported by FID/CLIP numbers in Table 1." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "Causal claims ('prompt projection reduces IP') are supported by controlled comparisons (same backbone, different alignment methods) and ablation studies that vary individual components (§D.1, D.2, D.5). The theoretical framework (SPAT) provides principled justification." 
124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title says 'Safe Text-to-Image Generation' in general, but experiments test only Stable Diffusion variants (SD1.5, SD2.1, SDXL) — all from the same model family. No other T2I architectures (DALL-E, Imagen, Midjourney) are tested. The broad framing outpaces the experimental evidence." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper does not consider alternative explanations for its results. For example, whether improvements come primarily from the VLM's conservatism rather than the projection mechanism, or whether the additional compute (LLM+VLM) rather than the specific method explains the gains. §4.4 discusses limitations but not alternative explanations." 134 }, 135 "proxy_outcome_distinction": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper defines IP explicitly via Q16 and NudeNet detectors (§5.1), acknowledges detector dependence in §4.4 ('Behavior therefore depends on the safety taxonomy, the conservatism of the safeguard VLM'), and the impact statement discusses 'false negatives/positives, unequal error rates across user groups.'" 139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "Diffusion models are versioned (SD1.5, SD2.1, SDXL). LLM scaling experiments in Appendix D.2 provide exact HuggingFace checkpoint names. However, the default LLM used for the main experiments (Table 1) is not specified, and the Stage-2 VLM is referenced only as 'a safeguard VLM (Lee et al.)' without an exact model version." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": false, 150 "justification": "The A/B scoring protocol token sets are shown in Table 4, and edit-preserving instructions are described in natural language (§4.2). 
However, the actual full prompt templates sent to the LLM for rewrite proposal and to the VLM for scoring are not provided — only their purpose is described." 151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": true, 155 "justification": "Appendix C.1 reports: 'τ = 0.05, α_safety = 20, a log-probability threshold of 20, a maximum escalation of 2, 3 local search steps, and 16 candidates.' Top-K=20 for logprob extraction is stated in §4.3." 156 }, 157 "scaffolding_described": { 158 "applies": true, 159 "answer": true, 160 "justification": "The two-stage cascade is described in detail: Algorithm 1 (local search projection) and Algorithm 2 (projected-reference sampling) provide pseudocode. §4.1-4.3 describe the LLM-based surrogate proposal, VLM-based verification, and the scoring protocol." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": true, 165 "justification": "Appendix F documents dataset sources with specific URLs, download procedures, the CoProV2 train/test split protocol, COCO-3K subsampling procedure (shuffle with seed=42, select first 3,000), and the 1,000-pair fitting subset construction." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": true, 172 "justification": "§4.4 'Scope and limitations' provides a dedicated subsection, and the Impact Statement further discusses residual harms, false negatives/positives, and deployment recommendations." 
173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": true, 177 "justification": "§4.4 provides specific threats: 'no global-optimality claim for metric projection onto C_safe,τ,' 'Stage-2 provides sample-level acceptance, not a distributional guarantee,' and behavior depends on 'the safety taxonomy, the conservatism of the safeguard VLM, and the re-sampling budget R.'" 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": true, 182 "justification": "§4.4 states specific scope boundaries: the method provides sample-level rather than distributional guarantees, prompt metric and edit-preserving guidance are not formal invariants, and meaning preservation may conflict with τ when unsafe content is inseparable from core intent." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "Source datasets are publicly available (CoProV2, I2P, UD, COCO). However, the paper's own generated images, computed scores, and intermediate results are not released for independent verification." 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "Appendix F describes the source and acquisition of each dataset with specific URLs, version information, and subsampling protocols." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants. All data comes from standard public benchmarks (CoProV2, I2P, UD, COCO)." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": true, 204 "justification": "Appendix F documents the full pipeline: dataset sources, download methods, subsampling with fixed seeds, train/test splitting, and the fitting subset construction (1,000 pairs from CoProV2 training split, disjoint from evaluation)." 
205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": false, 211 "justification": "No funding source, grant number, or acknowledgments section is present in the paper." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Author affiliations are clearly stated: Department of Mathematical Sciences (SNU), Interdisciplinary Program in AI (SNU), and Research Institute of Mathematics (SNU). No commercial product is being evaluated." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is itself a gap." 222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "No competing interests statement or financial interest declaration is present in the paper." 227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "The paper does not state training data cutoffs for the LLMs used in the pipeline or for the diffusion models (SD1.5, SD2.1, SDXL). The LLM could have seen CoProV2/I2P/UD prompts during training." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": false, 238 "justification": "No discussion of whether the LLM used for rewriting or the VLM used for scoring may have been trained on the evaluation benchmark prompts." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": false, 243 "justification": "CoProV2 and I2P are public datasets that could appear in LLM training data. If the LLM has seen these prompts and knows they are tagged as 'unsafe,' it could trivially rewrite them. This contamination concern is not addressed." 
244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": true, 287 "justification": "Table 7 (Appendix C.2) reports per-device runtimes per dataset/backbone. Appendix B.2 reports per-prompt wall-clock times (0.38–0.58s for cascade vs 1.82–2.51s for VLM-only). Appendix D.3 ablates per-image time under different hyperparameters." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": true, 292 "justification": "Appendix C.2 states: 'All experiments are conducted on a single node equipped with 8 NVIDIA H100 GPUs (80GB each).' Per-dataset runtimes are provided in Table 7." 293 } 294 }, 295 "experimental_rigor": { 296 "seed_sensitivity_reported": { 297 "applies": true, 298 "answer": false, 299 "justification": "No seed sensitivity analysis is performed. The COCO-3K uses a fixed seed for subsampling, and Appendix D.4 mentions 'single-seed run,' but no results across multiple seeds are reported." 
300 }, 301 "number_of_runs_stated": { 302 "applies": true, 303 "answer": false, 304 "justification": "The number of runs for the main results (Table 1) is not stated. Only Appendix D.4 states 'The reported numbers correspond to a single-seed run' for one specific ablation." 305 }, 306 "hyperparameter_search_budget": { 307 "applies": true, 308 "answer": false, 309 "justification": "No information on how many configurations were tried before arriving at the default hyperparameters. Appendix D.1 shows sensitivity analysis but not the search procedure." 310 }, 311 "best_config_selection_justified": { 312 "applies": true, 313 "answer": true, 314 "justification": "§5.1 states 'All model selection (including hyperparameter tuning) is performed only on CoProV2,' with train/test splits (15,690/8,000). This separates selection from evaluation. Appendix D.1 shows sensitivity to different hyperparameter choices." 315 }, 316 "multiple_comparison_correction": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper makes many pairwise comparisons across 12 methods, 3 backbones, and 4 datasets without any correction for multiple comparisons (no Bonferroni, Holm, or Benjamini-Hochberg)." 320 }, 321 "self_comparison_bias_addressed": { 322 "applies": true, 323 "answer": false, 324 "justification": "The authors evaluate their own system against baselines without acknowledging author-evaluation bias. It is unclear whether baseline results were re-run or taken from original papers." 325 }, 326 "compute_budget_vs_performance": { 327 "applies": true, 328 "answer": false, 329 "justification": "The proposed method uses strictly more compute at inference (LLM + VLM + diffusion model) than baselines that only use the diffusion model. While Appendix B.2 reports the cascade speedup vs VLM-only, no comparison is made at matched compute budgets against baselines." 
330 }, 331 "benchmark_construct_validity": { 332 "applies": true, 333 "answer": false, 334 "justification": "No discussion of whether CoProV2, I2P, or UD actually measure 'safety' as intended. The paper uses these benchmarks without questioning their construct validity or discussing how well automated detectors (Q16, NudeNet) align with human safety judgments." 335 }, 336 "scaffold_confound_addressed": { 337 "applies": true, 338 "answer": false, 339 "justification": "The method uses an additional LLM+VLM pipeline on top of the base generator, while most baselines modify only the generator. This architectural difference is a confound — the improvement could partly be attributed to the additional inference-time compute and models rather than the projection mechanism. This confound is not addressed." 340 } 341 }, 342 "data_leakage": { 343 "temporal_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether the LLM and VLM used in the pipeline were trained on data that includes CoProV2, I2P, or UD prompts, which would give them prior knowledge about which prompts are unsafe." 347 }, 348 "feature_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of feature leakage. The LLM in Stage-1 is explicitly asked whether a prompt is safe/unsafe, which could benefit from having seen benchmark prompt labels during training." 352 }, 353 "non_independence_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of independence between the 1,000-pair CoProV2 fitting subset and the test split. While they state the subsets are 'disjoint' (Appendix F), structural similarities (same LLM-generated prompts) are not addressed." 357 }, 358 "leakage_detection_method": { 359 "applies": true, 360 "answer": false, 361 "justification": "No leakage detection or prevention method is applied.
No canary strings, membership inference tests, or decontamination pipelines are used." 362 } 363 } 364 }, 365 "claims": [ 366 { 367 "claim": "Any nontrivial reduction in unsafe generations under a fixed reference model necessarily incurs TV deviation from the reference distribution (SPAT bound).", 368 "evidence": "Theorem 3.1 provides a formal proof: U(G) + ATV(G) ≥ U*, showing that safety gains require larger departures from reference conditionals. Proof in Appendix A.3.", 369 "supported": "strong" 370 }, 371 { 372 "claim": "The method achieves 16.7–60.0% relative reductions in inappropriate percentage versus strong model-level alignment baselines across four datasets and three diffusion backbones.", 373 "evidence": "Table 1 shows IP reductions: on SD1.5, IP drops to 0.04/0.06/0.04 on CoProV2/I2P/UD vs. next-best baselines at 0.06-0.07. On SDXL, IP reaches 0.03/0.04/0.02. Consistent improvements across SD2.1 as well.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "The method preserves benign prompt-image alignment close to the unaligned reference on COCO.", 378 "evidence": "Table 1: SD1.5 FID 32.46 vs 32.34 (no alignment), CLIP 33.36 vs 33.42. SD2.1 FID 32.85 vs 32.78, CLIP 34.92 vs 34.95. SDXL FID 32.45 vs 32.36, CLIP 36.01 vs 36.05. Differences are minimal.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "The tolerance parameter τ provides monotonic control over the safety level.", 383 "evidence": "Fig. 2a shows IP increases monotonically from ~0.04 to ~0.14 as τ increases from 0.05 to 0.5. Fig. 1 provides qualitative visualization of safety-content trade-off across τ values.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "The method is robust to adversarial prompt attacks, with IP rising only from 0.04 to at most 0.06.", 388 "evidence": "Table 2 shows category-wise IP under four attacks (MMA, Ring-A-Bell, SneakyPrompt, P4D). Total IP stays at 0.05-0.06 across all attacks. 
However, only tested on one backbone (SD1.5) with single-seed runs.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Larger LLMs yield lower IP, with scaling gains largely saturating between 3B-8B.", 393 "evidence": "Fig. 7 shows IP curves for LLaMA and Qwen model families across sizes. LLaMA: 0.06→0.03 (CoProV2) from 1B to 70B. Qwen: 0.12→0.02 from 0.5B to 72B. Evaluated on three benchmarks with fixed SD1.5 backbone.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "97.70% of COCO prompts are left unchanged by the projection, demonstrating near-identity behavior on safe prompts.", 398 "evidence": "Table 3a reports 97.70% unchanged-prompt ratio on COCO (with lowercase/whitespace canonicalization) vs 25.90% for POSI. Fig. 4a shows LDA projection confirming minimal centroid shift.", 399 "supported": "strong" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No error bars or multi-seed experiments", 405 "detail": "All results appear to be single-run (Appendix D.4 explicitly states 'single-seed run' for the A/B label-ordering ablation). With no variance estimates, it is impossible to assess result stability or whether observed differences are significant." 406 }, 407 { 408 "flag": "Unfair compute comparison", 409 "detail": "The method uses LLM + VLM + diffusion model at inference time, while most baselines only modify the diffusion model. The additional compute and model capacity is a confound. The 4.3-5.1x speedup reported is against their own VLM-only variant, not against baselines." 410 }, 411 { 412 "flag": "No human evaluation of safety or meaning preservation", 413 "detail": "All safety assessments rely on automated detectors (Q16, NudeNet). Whether prompt rewrites preserve the user's benign intent and whether the automated detectors agree with human safety judgments is unknown."
414 }, 415 { 416 "flag": "Guard-style baseline metric distortion acknowledged but not corrected", 417 "detail": "The paper notes that LatentGuard and GuardT2I abort 13.53% and 32.87% of COCO captions, creating selection bias in FID computation. Despite acknowledging this, the paper still presents these numbers in the same table without adjusted comparisons." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Safe latent diffusion: Mitigating inappropriate degeneration in diffusion models", 423 "authors": ["Patrick Schramowski", "Manuel Brack", "Björn Deiseroth", "Kristian Kersting"], 424 "year": 2023, 425 "relevance": "Key baseline for T2I safety alignment via latent-space guidance; introduced the IP metric used in this paper's evaluation." 426 }, 427 { 428 "title": "Erasing concepts from diffusion models", 429 "authors": ["Rohit Gandikota", "Joanna Materzynska", "Jaden Fiotto-Kaufman", "David Bau"], 430 "year": 2023, 431 "relevance": "Model-editing approach to T2I safety (ESD-u); demonstrates risks of model-level concept removal for safety alignment." 432 }, 433 { 434 "title": "Unified concept editing in diffusion models", 435 "authors": ["Rohit Gandikota", "Hadas Orgad", "Yonatan Belinkov", "Joanna Materzyńska", "David Bau"], 436 "year": 2024, 437 "relevance": "Inference-time cross-attention modulation for safety alignment (UCE); represents latent-control baseline approach." 438 }, 439 { 440 "title": "Latent guard: a safety framework for text-to-image generation", 441 "authors": ["Runtao Liu", "Ashkan Khakzar", "Jindong Gu", "Qifeng Chen", "Philip Torr", "Fabio Pizzati"], 442 "year": 2024, 443 "relevance": "Contrastive harmfulness detection in latent space above the text encoder; prompt-side guard baseline." 
444 }, 445 { 446 "title": "AlignGuard: Scalable safety alignment for text-to-image generation", 447 "authors": ["Runtao Liu", "I-Chao Chen", "Jindong Gu", "Jiaxu Zhang", "Renjie Pi", "Qifeng Chen", "Philip Torr", "Ashkan Khakzar", "Fabio Pizzati"], 448 "year": 2025, 449 "relevance": "Category-specific safety experts inserted into diffusion backbone; main evaluation protocol and CoProV2 dataset used in this paper." 450 }, 451 { 452 "title": "SAFREE: Training-free and adaptive guard for safe text-to-image and video generation", 453 "authors": ["Jaehong Yoon", "Shoubin Yu", "Vaidehi Patil", "Huaxiu Yao", "Mohit Bansal"], 454 "year": 2024, 455 "arxiv_id": "2410.12761", 456 "relevance": "Training-free safety guard that identifies toxic tokens and adapts diffusion timesteps; inference-time safety baseline." 457 }, 458 { 459 "title": "Safeguarding text-to-image generation via inference-time prompt-noise optimization", 460 "authors": ["Jiangweizhi Peng", "Zhiwei Tang", "Guangyao Liu", "Charles Fleming", "Mingyi Hong"], 461 "year": 2024, 462 "arxiv_id": "2412.03876", 463 "relevance": "Gradient-based inference-time optimization of text embeddings and noise trajectories for T2I safety." 464 }, 465 { 466 "title": "Llama guard: LLM-based input-output safeguard for human-AI conversations", 467 "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"], 468 "year": 2023, 469 "arxiv_id": "2312.06674", 470 "relevance": "LLM-based safety classification using prompt/response analysis; foundational work for LLM guard approaches in safety pipelines." 471 }, 472 { 473 "title": "GuardT2I: Defending text-to-image models from adversarial prompts", 474 "authors": ["Yingqian Yang", "Rui Gao", "Xin Yang", "Jingzhi Zhong", "Qiang Xu"], 475 "year": 2024, 476 "relevance": "Conditional LLM approach to defend T2I models from adversarial prompt obfuscation; guard-style baseline." 
477 }, 478 { 479 "title": "Value-aligned prompt moderation via zero-shot agentic rewriting for safe image generation", 480 "authors": ["Xin Zhao", "Xiang Chen", "Bo Liu", "Zekun Liu", "Zheyuan Zhao", "Xiaoguang Gu"], 481 "year": 2025, 482 "arxiv_id": "2511.11693", 483 "relevance": "Agentic prompt rewriting with rewrite-generate-verify loop for T2I safety; closest methodological relative to this work." 484 }, 485 { 486 "title": "MMA-diffusion: Multimodal attack on diffusion models", 487 "authors": ["Yijun Yang", "Ruiyuan Gao", "Xiaosen Wang", "Tsung-Yi Ho", "Nan Xu", "Qiang Xu"], 488 "year": 2024, 489 "relevance": "Adversarial attack method for T2I models used as robustness evaluation in this paper." 490 }, 491 { 492 "title": "PromptGuard: Soft prompt-guided unsafe content moderation for text-to-image models", 493 "authors": ["Ling Yuan", "Xin Li", "Chenxi Xu"], 494 "year": 2025, 495 "arxiv_id": "2501.03544", 496 "relevance": "Universal safety soft prompt approach for T2I moderation; embedding-space safety baseline." 497 } 498 ] 499 }