scan.json (32281B)
1 { 2 "paper": { 3 "title": "Refining Input Guardrails: Enhancing LLM-as-a-Judge Efficiency Through Chain-of-Thought Fine-Tuning and Alignment", 4 "authors": [ 5 "Melissa Kazemi Rad", 6 "Huy Nghiem", 7 "Andy Luo", 8 "Sahil Wadhwa", 9 "Mohammad Sorower", 10 "Stephen Rawls" 11 ], 12 "year": 2025, 13 "venue": "arXiv", 14 "arxiv_id": "2501.13080", 15 "doi": "10.48550/arXiv.2501.13080" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "Supervised fine-tuning (SFT) with only 400 training examples significantly improves LLM performance as input moderation guardrails, with maximum F1 lift of 227% and ADR lift of 344% across four LLMs. DPO and KTO alignment provide marginal additional gains over SFT alone. The best model (Llama3-DPO, F1=96.1) substantially outperforms LlamaGuard-2 (F1=69.2) and DeBERTaV3 (F1=81.4) on the authors' curated test set. Qualitative analysis reveals base LLMs fail to recognize adversarial patterns like token-level jailbreaks, while fine-tuned versions detect them at the cost of slightly increased false positive rates (0.1% → 1.6%).", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. No mention of code release plans." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "The curated training/test dataset is not released. While the source datasets (AdvBench, MaliciousInstruct, Forbidden Question Set, Jailbreak Prompt Set) are public, the authors' specific selections, splits, annotations, and synthetically generated negative examples are not made available." 
32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "Appendix D mentions AWS P4D.24XLARGE with 8 A100 40GB GPUs, 96 vCPUs, and 1152 GiB memory, but no software environment details (Python version, library versions, requirements.txt, Dockerfile) are provided." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While training hyperparameters are listed in Table 3, the overall pipeline cannot be reproduced without code, data, and instructions." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results (F1, ADR, FPR, Invalid Response Ratio) are reported as point estimates in Figure 3 and Table 1 with no confidence intervals, error bars, or uncertainty measures." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims 'SFT leads to the most significant lift' and compares multiple methods, but no statistical significance tests (p-values, t-tests, bootstrap) are used. All comparisons are based on comparing raw numbers." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper reports relative improvements with baseline context, e.g., 'maximum lift of 227% and 344% in F1 and ADR, respectively' and shows baseline vs. tuned values. Table 1 shows Llama3-DPO (ADR 93.3) vs LlamaGuard-2 (ADR 54.2), giving readers context for the magnitude of differences." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The training set of 400 and test set of ~6800 examples are stated but never justified. No power analysis or reasoning for why these sizes are adequate for the claims being made." 
64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No mention of multiple experimental runs, standard deviations, or variance measures. Results appear to be from single runs for each configuration." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The paper compares against base (untuned) versions of all four LLMs, and additionally compares the best model (Llama3-DPO) against LlamaGuard-2, ProtectAI DeBERTaV3, and Meta PromptGuard in Table 1." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Baselines include LlamaGuard-2 (2024), PromptGuard (2024), and DeBERTaV3 — all contemporary guardrail models. The base LLMs themselves (Llama3 8B Instruct, Mistral 7B v0.2) are also recent." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper systematically compares Base → SFT → SFT+DPO and SFT+KTO across all four LLMs, effectively ablating the contribution of each training stage. Figure 3 shows these comparisons across all metrics." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Four metrics are reported: F1 Score, Attack Detection Ratio (ADR/recall), False Positive Rate (FPR), and Invalid Response Ratio (Figure 3)." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation of the guardrail system's outputs is conducted. Human annotation was used for creating training data (Table 4, Appendix F), but humans did not evaluate the system's test-time outputs." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The paper clearly separates 400 training and ~6800 test examples: 'any type of query used in the training dataset is excluded from the test set' (Appendix C)." 
101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Figure 4 provides ADR breakdowns for three query types: standalone jailbreak, malicious queries with jailbreak prompts, and standalone malicious queries, across all LLMs and tuning strategies." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Tables 5-7 and Appendix H provide qualitative analysis of failure cases: Table 5 shows base model failures on adversarial queries, Table 7 shows false positives from the KTO-aligned model, including users with disabilities being incorrectly flagged." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports FPR increases as a negative outcome (0.1% to 1.6% for Llama3-KTO), and Table 7 explicitly shows false positive examples where the KTO model is 'hyper-sensitive.' They note 'surprisingly a relatively small increase in valid queries incorrectly classified as fraudulent.'" 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims 'significantly enhance the safety of conversational AI systems' and 'potential of alignment processes tailored to a varied range of harmful input queries, even with constrained data resources.' These are supported by the experimental results showing large F1/ADR improvements (Figure 3) with only 400 training examples." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper makes causal claims like 'SFT leads to the most significant lift' through a controlled sequential design: Base → SFT → SFT+DPO/KTO. Each step adds one component while holding others fixed, constituting adequate single-variable manipulation for causal inference about training method effects." 
128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The abstract claims these techniques 'significantly enhance the safety of conversational AI systems' and provide 'a feasible framework for deploying more secure and trustworthy AI-driven interactions,' but testing is limited to one violation category (malicious/jailbreak) on one curated test distribution. The paper acknowledges 'covering all existing attack vectors... can be a limiting factor' but the title and framing are broader than the evidence." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not discuss alternative explanations for the observed improvements, such as whether SFT gains come from format compliance vs. genuine reasoning improvement, whether the test set distribution particularly favors fine-tuned models, or whether memorization of training patterns explains performance." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper measures F1/ADR/FPR on a curated binary classification test set and frames this as 'enhancing the safety of conversational AI systems' and 'a feasible framework for deploying more secure AI-driven interactions.' No discussion of the gap between test set performance and real-world deployment safety is provided." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific model versions are provided: Mistral-7B-Instruct-v0.2, Mixtral-8x7B-Instruct-v0.1, Llama2-13B-Chat, and Llama3-8B-Instruct (Table 3 and throughout)." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Figure 6 shows the prompt structure with color-coded sections (system prompt, CoT prompt, user query + policy definition). Figure 7 shows the custom safety prompt added to LlamaGuard-2. 
Table 4 shows example accepted/rejected responses with their formatting." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Table 3 reports LoRA rank (256), alpha, learning rates, batch sizes, beta (0.1), and epochs for all three tuning strategies. Evaluation uses 'top p 1 and temperature 0.' A #END trigger token is used to prevent repetitive generation." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The approach is standard LLM fine-tuning and single-pass inference." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Appendix C documents the full data construction: selection of 128 distinct AdvBench examples, all 100 MaliciousInstruct, 180 from Forbidden Question Set, 240 jailbreak prompts, 3600 synthetic negatives. Training/test split proportions and exclusion criteria are described." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Appendix I is titled 'Discussions and Limitations' and provides substantive discussion of resource requirements, training time tradeoffs, and limitations of attack vector coverage." 
177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "The paper identifies specific threats: 'covering all existing attack vectors in both fine-tuning and evaluation datasets can be a limiting factor,' larger LLMs 'may require a larger training set or additional training epochs,' and alignment techniques may need 'a larger and/or more diverse set of rejected responses.'" 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "While the paper states its scope is 'the malicious/jailbreak category' and acknowledges attack coverage limitations, it does not systematically enumerate what was not tested (other attack types, other domains, production conditions, adaptive adversaries, etc.)." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The curated dataset (training annotations, test splits, rejected responses) is not released. Only the source datasets are publicly available, but not the authors' specific selections and annotations." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Appendix C describes in detail which datasets were used, how many examples from each, the selection criteria (e.g., '128 distinct ones that fall under the category of fraudulent requests'), and how negative examples were synthetically generated." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data sources are standard public datasets (AdvBench, MaliciousInstruct, Forbidden Question Set, Jailbreak Prompt Set) with synthetic negative examples." 
204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Appendix C documents the pipeline: source dataset selection → example filtering → proportional sampling for training → test set construction from remaining examples → accepted response generation via Mixtral → manual annotation → rejected response synthesis (3 per query, 1200 total)." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding or acknowledgments section is present. Five of six authors are affiliated with Capital One, but no explicit funding disclosure is made." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: five authors from Capital One and one from University of Maryland, College Park." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "Five of six authors are Capital One employees. Capital One, as a financial services company deploying conversational AI, has a direct commercial interest in demonstrating that fine-tuned LLM guardrails are effective for their products." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement is present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the four base models (Mistral 7B, Mixtral 8x7B, Llama2 13B, Llama3 8B), despite evaluating their base (untuned) performance on the test set." 
238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether the base models' pre-training data included examples from the public test set sources (AdvBench, MaliciousInstruct, etc.), several of which were published before the more recent base models (e.g., Llama3, 2024) were trained." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "The test data comes from public datasets (AdvBench 2023, MaliciousInstruct 2023, Jailbreak Prompt Set 2024) that predate or overlap with Llama3's training period. This contamination risk is not discussed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants. The study involves LLM fine-tuning and evaluation on curated datasets." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. The study uses publicly available datasets and synthetic data." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in the study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference cost or latency is reported despite the paper noting 'low response latency is critical.' 
Figure 9 compares parameter counts as 'a proxy for inference latency' but does not measure actual latency or cost." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": true, 296 "justification": "Appendix D specifies AWS P4D.24XLARGE instances with 8 A100 40GB GPUs. Table 2 reports training hours for each experiment (e.g., Llama3 SFT: 2.5h, +DPO: 0.5h, +KTO: 2h). GPU counts per task are also stated (3, 6, 7 GPUs for SFT, DPO, KTO)." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single training runs." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they come from single or multiple runs." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": true, 313 "justification": "Table 3 reports the hyperparameter search space (LoRA rank, alpha, learning rate, batch size, beta, epochs) and the paper states 'We have not conducted an exhaustive hyperparameter search... due to resource constraints. Instead, we considered a small hyperparameter space using Mistral-7B-Instruct-v0.2.'" 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "The paper says 'The best hyperparameters discovered were then adopted in the other three LLMs' but does not explain the selection criterion — no mention of a validation set or what metric determined 'best.' It is unclear whether selection was done on test data." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 
324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors compare their fine-tuned models against external baselines (LlamaGuard-2, DeBERTaV3, PromptGuard) on their own curated test set without acknowledging the bias of evaluating on a dataset they designed." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": true, 333 "justification": "Figure 9 explicitly plots training hours vs F1 Score vs number of parameters for all experiments and LLMs, directly addressing the compute-performance tradeoff." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of whether the curated test set (from AdvBench, MaliciousInstruct, etc.) is representative of real-world attack distributions or whether performance on this benchmark generalizes to production guardrail effectiveness." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved. The approach is standard fine-tuning with direct inference." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "The source datasets (AdvBench, MaliciousInstruct, Jailbreak Prompt Set — all published 2023-2024) predate some base models (Llama3, 2024). No discussion of whether these test examples appeared in pre-training data." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information. The binary classification prompt explicitly instructs the model about the policy violation category, which could serve as a strong feature." 
356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "Training and test examples are drawn from the same source datasets (AdvBench, MaliciousInstruct, etc.) and while specific examples don't overlap, no analysis of distributional similarity or near-duplicate patterns between train and test is provided." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention method (canary strings, membership inference, n-gram overlap) is applied." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "SFT leads to the most significant performance lift across all LLMs, with maximum improvements of 227% in F1 and 344% in ADR.", 372 "evidence": "Figure 3 shows F1, ADR, FPR, and Invalid Response Ratio for base vs. SFT vs. SFT+DPO vs. SFT+KTO across four LLMs. The maximum lifts are stated in the Results section.", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "DPO and KTO alignment provide additional improvements over SFT, with KTO slightly outperforming DPO in F1 and ADR in most LLMs.", 377 "evidence": "Figure 3 shows marginal gains from DPO/KTO over SFT. 'KTO slightly outperforming DPO in F1 and ADR in all LLMs, with the exception of Llama3 8B Instruct.'", 378 "supported": "weak" 379 }, 380 { 381 "claim": "Llama3-DPO outperforms LlamaGuard-2 in ADR by 172% and reduces FPR by 275%.", 382 "evidence": "Table 1: Llama3-DPO achieves ADR 93.3%, FPR 0.8% vs. LlamaGuard-2 ADR 54.2%, FPR 2.2% on the full test set. Note that these values correspond to ratios of about 1.72x (ADR) and 2.75x (FPR), i.e., a relative ADR gain of about 72% and a relative FPR reduction of about 64%, so the quoted percentages are best read as ratios rather than relative changes.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Base LLMs are poorly equipped against standalone jailbreaks, with ADR values as much as 7.5x smaller than for jailbreaks with malicious queries.", 387 "evidence": "Figure 4 shows baseline ADR broken down by query type. 
Standalone jailbreak ADR is substantially lower than combined jailbreak+malicious across all four base LLMs.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "These performance improvements can be obtained with small training datasets (400 examples) and minimal hyperparameter tuning.", 392 "evidence": "Training set is 400 examples (200 positive, 200 negative). Hyperparameter search was limited (Table 3, Appendix E). Results on ~6800 test examples show significant improvements.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Fine-tuning reduces the invalid response ratio, with Llama3 8B Instruct dropping from 16.8% to 0.3% with DPO.", 397 "evidence": "Figure 3, lower right panel. 'The invalid response ratio categorically drops with any tuning technique, by a wider margin for Llama3 8B Instruct.'", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Smaller tuned LLMs can achieve better results as input guardrails than larger counterparts with relatively smaller resources.", 402 "evidence": "Appendix I discusses this: Llama3-DPO (8B) outperforms Mixtral 8x7B and Llama2 13B variants with less training time. Figure 9 visualizes this tradeoff.", 403 "supported": "moderate" 404 } 405 ], 406 "red_flags": [ 407 { 408 "flag": "No error bars or variance reporting", 409 "detail": "All results are single-run point estimates with no uncertainty quantification. Given the small training set (400 examples), results could vary substantially across runs or seeds. The marginal DPO/KTO gains over SFT may not be statistically significant." 410 }, 411 { 412 "flag": "Evaluation on authors' own curated test set", 413 "detail": "The comparison against external baselines (LlamaGuard-2, DeBERTaV3, PromptGuard) is conducted on the authors' own curated test set, which was designed for their specific policy definition. 
PromptGuard's 99.8% FPR suggests a severe distribution mismatch rather than genuine model failure, as PromptGuard was designed for prompt injection detection, not their malicious content taxonomy." 414 }, 415 { 416 "flag": "Corporate conflict of interest", 417 "detail": "Five of six authors are Capital One employees evaluating a guardrail approach relevant to Capital One's products. No conflict of interest or funding disclosure is provided. The authors have a commercial motivation for positive results." 418 }, 419 { 420 "flag": "Hyperparameter selection methodology unclear", 421 "detail": "Hyperparameters were tuned on Mistral 7B and transferred to other models. The selection criterion for 'best' is not stated — no validation set is mentioned, raising the possibility of test set selection." 422 }, 423 { 424 "flag": "Unfair baseline comparison with PromptGuard", 425 "detail": "PromptGuard achieves 99.8% FPR on their dataset, indicating it was evaluated outside its intended use case (it detects prompt injection, not malicious content broadly). Reporting that Llama3-DPO achieves '99% FPR reduction over PromptGuard' is misleading when the tool was not designed for their task." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 431 "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J. Z. Kolter", "M. Fredrikson"], 432 "year": 2023, 433 "arxiv_id": "2307.15043", 434 "relevance": "Foundational work on adversarial suffix attacks against LLMs showing transferability to black-box models; source of the AdvBench dataset used in this study." 435 }, 436 { 437 "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations", 438 "authors": ["H. Inan", "K. Upasani", "J. Chi", "R. Rungta", "K. Iyer", "Y. Mao"], 439 "year": 2023, 440 "arxiv_id": "2312.06674", 441 "relevance": "LLM-based safety guardrail for human-AI conversations; LlamaGuard-2 is a key baseline in this study." 
442 }, 443 { 444 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 445 "authors": ["R. Rafailov", "A. Sharma", "E. Mitchell", "S. Ermon", "C. D. Manning", "C. Finn"], 446 "year": 2024, 447 "arxiv_id": "2305.18290", 448 "relevance": "Core alignment technique used in this study; DPO is one of two alignment methods evaluated for improving guardrail CoT reasoning." 449 }, 450 { 451 "title": "KTO: Model Alignment as Prospect Theoretic Optimization", 452 "authors": ["K. Ethayarajh", "W. Xu", "N. Muennighoff", "D. Jurafsky", "D. Kiela"], 453 "year": 2024, 454 "arxiv_id": "2402.01306", 455 "relevance": "Alternative alignment technique evaluated in this study; KTO uses prospect theory loss and only requires binary signals rather than preference pairs." 456 }, 457 { 458 "title": "Jailbreaking Black Box Large Language Models in Twenty Queries", 459 "authors": ["P. Chao", "A. Robey", "E. Dobriban", "H. Hassani", "G. J. Pappas", "E. Wong"], 460 "year": 2024, 461 "arxiv_id": "2310.08419", 462 "relevance": "Automated black-box jailbreak attack method; motivates the need for robust input guardrails studied in this paper." 463 }, 464 { 465 "title": "NeMo Guardrails: A Toolkit for Controllable and Safe LLM Applications with Programmable Rails", 466 "authors": ["T. Rebedea", "R. Dinu", "M. Sreedhar", "C. Parisien", "J. Cohen"], 467 "year": 2023, 468 "arxiv_id": "2310.10501", 469 "relevance": "Programmable guardrails toolkit for LLM applications; represents an alternative approach to the LLM-as-a-Judge guardrail paradigm studied here." 470 }, 471 { 472 "title": "Jailbroken: How Does LLM Safety Training Fail?", 473 "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"], 474 "year": 2024, 475 "relevance": "Analysis of how safety training fails against adversarial attacks; contextualizes the vulnerability of base LLMs that this paper's guardrails address." 
476 }, 477 { 478 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 479 "authors": ["E. J. Hu", "Y. Shen", "P. Wallis", "Z. Allen-Zhu", "Y. Li", "S. Wang", "L. Wang", "W. Chen"], 480 "year": 2021, 481 "arxiv_id": "2106.09685", 482 "relevance": "Parameter-efficient fine-tuning method used in all experiments to mitigate catastrophic forgetting during guardrail adaptation." 483 }, 484 { 485 "title": "Training Language Models to Follow Instructions with Human Feedback", 486 "authors": ["L. Ouyang", "J. Wu", "X. Jiang", "D. Almeida", "C. L. Wainwright"], 487 "year": 2022, 488 "arxiv_id": "2203.02155", 489 "relevance": "RLHF methodology for aligning LLMs with human preferences; the study's DPO/KTO approach is positioned as a more efficient alternative." 490 }, 491 { 492 "title": "Do Anything Now: Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models", 493 "authors": ["X. Shen", "Z. Chen", "M. Backes", "Y. Shen", "Y. Zhang"], 494 "year": 2024, 495 "arxiv_id": "2308.03825", 496 "relevance": "Comprehensive study of jailbreak prompts in the wild; source of the Forbidden Question Set and Jailbreak Prompt Set used for training and evaluation." 497 }, 498 { 499 "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically", 500 "authors": ["A. Mehrotra", "M. Zampetakis", "P. Kassianik", "B. Nelson", "H. Anderson", "Y. Singer", "A. Karbasi"], 501 "year": 2024, 502 "arxiv_id": "2312.02119", 503 "relevance": "Automated multi-turn jailbreak attack method; represents the evolving threat landscape that motivates input guardrail research." 504 }, 505 { 506 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 507 "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "E. H. Chi", "Q. Le", "D. Zhou"], 508 "year": 2022, 509 "relevance": "Foundational CoT prompting work that motivates the study's use of CoT reasoning to improve guardrail accuracy and explanation quality." 
510 } 511 ], 512 "engagement_factors": { 513 "practical_relevance": { 514 "score": 2, 515 "justification": "Directly applicable to companies deploying conversational AI with safety requirements; the SFT approach with small data is practical but no code is released." 516 }, 517 "surprise_contrarian": { 518 "score": 0, 519 "justification": "Confirms the expected finding that fine-tuning improves task-specific LLM performance; no surprising or contrarian results." 520 }, 521 "fear_safety": { 522 "score": 2, 523 "justification": "Addresses LLM safety vulnerabilities and demonstrates that base models are poorly equipped against jailbreaks, raising security concerns for deployed systems." 524 }, 525 "drama_conflict": { 526 "score": 0, 527 "justification": "No controversy or conflict; straightforward empirical comparison of fine-tuning strategies." 528 }, 529 "demo_ability": { 530 "score": 0, 531 "justification": "No code, demo, or model weights released; results cannot be reproduced or tried by practitioners." 532 }, 533 "brand_recognition": { 534 "score": 1, 535 "justification": "Capital One is a recognizable brand but not a prominent AI research lab; the work is not about a widely known AI product." 536 } 537 } 538 }