scan-v5.json (17484B)
1 { 2 "scan_version": 5, 3 "paper_type": "theoretical", 4 "paper": { 5 "title": "Inference-Only Prompt Projection for Safe Text-to-Image Generation with TV Guarantees", 6 "authors": [ 7 "Minhyuk Lee", 8 "Hyekyung Yoon", 9 "Myungjoo Kang" 10 ], 11 "year": 2026, 12 "venue": "arXiv", 13 "arxiv_id": "2602.00616", 14 "doi": null 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All abstract claims are supported: the SPAT bound is proven in Theorem 3.1, the 16.7–60.0% IP reductions appear in Table 1, and COCO FID/CLIP preservation is reported alongside every baseline.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Causal claims (method causes IP reduction) are supported by controlled comparisons against 11+ baselines on standardized benchmarks plus ablations on hyperparameters, LLM scaling, and label ordering.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "Claims are explicitly scoped to 'four datasets and three diffusion backbones' throughout and in the conclusion; the paper does not claim results extend to all T2I architectures.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper discusses guard-style metric distortion as an explanation for baselines' inflated utility scores, but does not explore alternative explanations for its own superiority—e.g., whether gains stem from VLM verifier conservatism rather than the projection framework.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper explicitly distinguishes ideal unsafety u(G*|c) (unobservable) from the operational proxy û_VLM in §3.4 and §4.4, and acknowledges Stage-2 provides sample-level acceptance, not 
a distributional guarantee.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "Section 4.4 is titled 'Scope and limitations' and the Impact Statement explicitly lists residual harms (false negatives/positives, unequal error rates, misconfiguration risks); both go well beyond a single sentence.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "§4.4 identifies specific threats: no global-optimality claim for metric projection onto Csafe,τ; sample-level (not distributional) acceptance; dependence on VLM conservatism and resampling budget R; inability to preserve meaning when unsafe content is inseparable from intent.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "The paper explicitly states it makes 'no global-optimality claim for metric projection' and that Stage-2 provides 'sample-level acceptance... 
not a distributional guarantee,' clearly marking what the method does not show.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding acknowledgment appears anywhere in the paper; only institutional affiliations (Seoul National University) are listed.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors' departmental affiliations at Seoul National University are disclosed in the header footnote.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funder is disclosed, making this criterion not applicable.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Total variation (Eq. 1), prompt-wise and population unsafety (Eq. 2–3), ATV (Eq. 4), τ-safe set Csafe,τ (Eq. 
7), projection kernel Πτ, and the A/B scoring protocol are all formally defined before use.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The introduction lists four explicit bullet-point contributions: the SPAT bound, the inference-only projection framework, the two-stage cascade operationalization, and the empirical evaluation.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 explicitly positions this work against three categories of prior methods, noting that prior prompt-side methods 'lack an explicit projection notion' and 'rarely describe the prompt-conditioned distributional shift induced by the intervention.'", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "theoretical": { 118 "formal_quality": { 119 "assumptions_stated_explicitly": { 120 "applies": true, 121 "answer": true, 122 "justification": "Assumption 3.2 (main text) and Assumption A.2 (appendix) explicitly enumerate all required conditions—standard Borel space, weak continuity of reference law, closedness of the τ-safe set—before each theorem is invoked.", 123 "source": "haiku" 124 }, 125 "proofs_complete_or_sketched": { 126 "applies": true, 127 "answer": true, 128 "justification": "Theorems 3.1, 3.3, and 3.4 all have full measure-theoretic proofs in Appendix A.3, A.4, and A.6 respectively, with supporting lemmas (A.1, A.3–A.11) each fully proven.", 129 "source": "haiku" 130 }, 131 "bounds_tight_or_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The SPAT lower bound U(G) + ATV(G) ≥ U* is proven but its tightness—whether equality is achievable and under what conditions—is never analyzed or discussed.", 135 "source": "haiku" 136 }, 137 "counterexamples_explored": { 138 "applies": true, 139 "answer": false, 140 "justification": "Edge cases are handled by ruling them out in 
assumptions (e.g., Csafe,τ nonempty), but no counterexamples are constructed to show where bounds fail under relaxed assumptions.", 141 "source": "haiku" 142 }, 143 "notation_consistent": { 144 "applies": true, 145 "answer": true, 146 "justification": "Notation is consistent throughout: G* for reference, u for prompt-wise unsafety, U for population unsafety, Πτ for projection kernel, tilde for kernel-projected conditionals, and the A/B scoring protocol uses identical symbols in both stages.", 147 "source": "haiku" 148 }, 149 "constructive_vs_existence_noted": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper explicitly states 'Theorem 3.3 is primarily an enabling result: it guarantees (a.e.) existence and measurability, but does not assert uniqueness,' and Algorithm 2 provides the constructive practical approximation.", 153 "source": "haiku" 154 } 155 }, 156 "connections": { 157 "connection_to_practice_discussed": { 158 "applies": true, 159 "answer": true, 160 "justification": "The entire §4 and §5 translate the theoretical SPAT result into a practical two-stage algorithm; the theory directly motivates each design choice (selectivity, projection kernel, τ-control).", 161 "source": "haiku" 162 }, 163 "relationship_to_prior_work_clear": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 2 explicitly contrasts this work with three prior method families, citing specific gaps; §3 formalizes the distributional phenomenon AlignGuard observed empirically but did not explain theoretically.", 167 "source": "haiku" 168 }, 169 "computational_complexity_discussed": { 170 "applies": true, 171 "answer": true, 172 "justification": "The paper notes exact projection is 'infeasible' (§4.2), uses a finite local-search approximation with explicit budgets T and N, and reports per-image wall-clock times in Table 7 and the runtime ablation in Figure 8.", 173 "source": "haiku" 174 }, 175 "limitations_of_formal_model_stated": { 176 "applies": 
true, 177 "answer": true, 178 "justification": "§3.4 and §4.4 explicitly acknowledge that the formal model uses ideal unsafety u(G*|c) which 'is not directly observable in practice,' and treat Πτ as a 'procedure-induced kernel' that only approximately satisfies the formal support restriction.", 179 "source": "haiku" 180 } 181 } 182 } 183 }, 184 "claims": [ 185 { 186 "claim": "Any nontrivial reduction in unsafe generations under a fixed reference model necessarily incurs TV deviation from the reference conditionals (SPAT).", 187 "evidence": "Theorem 3.1 proven via Lemma A.1 (TV controls bounded functional gaps); Theorem 3.4 extends to the kernelized setting.", 188 "supported": "strong" 189 }, 190 { 191 "claim": "The method achieves 16.7–60.0% relative reductions in inappropriate percentage (IP) vs. strong model-level alignment baselines.", 192 "evidence": "Table 1 reports IP scores across CoProV2/I2P/UD on SD1.5, SD2.1, SDXL; relative reductions computed against AlignGuard and other baselines.", 193 "supported": "strong" 194 }, 195 { 196 "claim": "The method preserves benign prompt-image alignment (FID/CLIP) near the unaligned reference on COCO.", 197 "evidence": "Table 1: COCO FID=32.46/CLIP=33.36 (ours) vs. 
32.34/33.42 (no alignment) for SD1.5; similar near-parity on SD2.1 and SDXL.", 198 "supported": "strong" 199 }, 200 { 201 "claim": "Adversarial prompt attacks raise IP from 0.04 to at most 0.06 across all four attack methods.", 202 "evidence": "Table 2 reports category-wise IP under MMA, Ring-A-Bell, SneakyPrompt, and P4D attacks on CoProV2.", 203 "supported": "strong" 204 }, 205 { 206 "claim": "Guard-style methods abort 13.53% and 32.87% of benign COCO captions, inflating their reported FID/CLIP metrics.", 207 "evidence": "§5.2 explicitly reports these abort rates and explains the selection bias in utility measurement.", 208 "supported": "strong" 209 }, 210 { 211 "claim": "Larger LLMs yield lower IP scores, with gains mostly saturating between 3B–8B parameters.", 212 "evidence": "Figure 7 shows IP vs. LLM parameter count for LLaMA and Qwen families across three benchmarks; mild non-monotonicity at high end noted.", 213 "supported": "moderate" 214 }, 215 { 216 "claim": "τ provides monotonic, predictable control over the operating safety level.", 217 "evidence": "Figure 2a shows IP increases monotonically with τ on a four-point sweep; Figure 1 shows qualitative image progression across τ values.", 218 "supported": "strong" 219 } 220 ], 221 "methodology_tags": [ 222 "theoretical", 223 "benchmark-eval" 224 ], 225 "key_findings": "The paper proves the Safety-Prompt Alignment Trade-off (SPAT): under total variation, any nontrivial reduction in unsafe image generations from a fixed reference model necessarily deviates from the reference conditional distribution, providing a formal lower bound that explains why model-level alignment degrades benign prompt fidelity. The proposed inference-only prompt projection framework rewrites only high-risk prompts via a two-stage LLM+VLM cascade without retraining the diffusion backbone, achieving 16.7–60.0% relative IP reductions vs. strong baselines while keeping COCO FID/CLIP near the unaligned reference. 
Guard-style methods inflate reported utility metrics due to 13–33% prompt refusal rates on benign COCO captions, a confound not previously documented. The method is robust to adversarial attacks (IP rises from 0.04 to at most 0.06) and the safety level can be continuously controlled via the tolerance parameter τ.", 226 "red_flags": [ 227 { 228 "flag": "No funding disclosed", 229 "detail": "The paper contains no acknowledgment of funding sources, making potential conflicts of interest impossible to assess." 230 }, 231 { 232 "flag": "Safety proxy not validated", 233 "detail": "Safety is measured using Q16 and NudeNet detectors whose accuracy, false positive/negative rates, and demographic biases are not validated; IP scores may not reflect actual safety for all harmful content categories." 234 }, 235 { 236 "flag": "SPAT bound tightness not discussed", 237 "detail": "The core bound U(G) + ATV(G) ≥ U* is proven but never analyzed for tightness—whether equality is achievable or the bound is loose in practice is unaddressed, limiting its practical diagnostic value." 238 }, 239 { 240 "flag": "Theory-to-practice gap unbounded", 241 "detail": "§3.4 and §4.4 acknowledge that the procedure-induced kernel only 'approximately' satisfies the formal support restriction to Csafe,τ, but the approximation gap is not formally quantified." 242 } 243 ], 244 "cited_papers": [ 245 { 246 "title": "AlignGuard: Scalable Safety Alignment for Text-to-Image Generation", 247 "relevance": "Primary motivating baseline; documents empirically that stronger alignment degrades benign COCO alignment, motivating the SPAT analysis." 248 }, 249 { 250 "title": "Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models", 251 "relevance": "Introduces SLD and the IP/Q16 metric adopted by this paper; foundational model-level alignment baseline." 
252 }, 253 { 254 "title": "LatentGuard: A Safety Framework for Text-to-Image Generation", 255 "relevance": "Key prompt-side baseline operating in latent space; directly compared in main results and discussed for abort-rate metric inflation." 256 }, 257 { 258 "title": "SAFREE: Training-Free and Adaptive Guard for Safe Text-to-Image and Video Generation", 259 "relevance": "Inference-time baseline that identifies toxic tokens and adapts diffusion timesteps; compared across all backbones." 260 }, 261 { 262 "title": "Value-Aligned Prompt Moderation via Zero-Shot Agentic Rewriting for Safe Image Generation (VALOR)", 263 "relevance": "Closest prior work combining prompt rewriting with a rewrite-generate-verify loop; key design comparison for prompt-space intervention." 264 }, 265 { 266 "title": "GuardT2I: Defending Text-to-Image Models from Adversarial Prompts", 267 "relevance": "Adversarial-robust guard-style baseline; also illustrates abort-rate metric inflation discussed in §5.2." 268 }, 269 { 270 "title": "Erasing Concepts from Diffusion Models (ESD-u)", 271 "relevance": "Model-editing baseline that modifies diffusion weights to remove unsafe concepts; represents the model-level intervention class." 272 }, 273 { 274 "title": "Universal Prompt Optimizer for Safe Text-to-Image Generation (POSI)", 275 "relevance": "Prompt-rewriting baseline used in projection diagnostic comparison (Table 3a) for near-identity behavior on safe prompts." 276 }, 277 { 278 "title": "PromptGuard: Soft Prompt-Guided Unsafe Content Moderation for Text-to-Image Models", 279 "relevance": "Embedding-space prompt intervention baseline with strong in-domain performance but weaker OOD generalization compared in Table 1." 280 } 281 ], 282 "engagement_factors": { 283 "practical_relevance": { 284 "score": 2, 285 "justification": "Inference-only, no-retraining approach directly deployable on existing T2I models, but requires LLM+VLM infrastructure adding per-image latency." 
286 }, 287 "surprise_contrarian": { 288 "score": 2, 289 "justification": "Formally proves that any safety improvement necessarily trades off against prompt-image alignment—an intuitively expected but previously unformalized result that reframes how safety methods should be compared." 290 }, 291 "fear_safety": { 292 "score": 2, 293 "justification": "Directly addresses the safety of deployed T2I systems against harmful image generation and adversarial prompt attacks, with explicit discussion of residual harms." 294 }, 295 "drama_conflict": { 296 "score": 1, 297 "justification": "Shows model-level alignment is fundamentally constrained by SPAT, but the critique of prior work is framed constructively rather than as a takedown." 298 }, 299 "demo_ability": { 300 "score": 2, 301 "justification": "Inference-only algorithm with detailed pseudocode (Algorithms 1–2) and public dataset references; reproducible without access to proprietary infrastructure." 302 }, 303 "brand_recognition": { 304 "score": 1, 305 "justification": "Authors from Seoul National University mathematics department; no major industry lab or product affiliation." 306 } 307 }, 308 "hn_data": { 309 "threads": [], 310 "top_points": 0, 311 "total_points": 0, 312 "total_comments": 0 313 } 314 }