scan.json (30777B)
1 { 2 "paper": { 3 "title": "SecureCAI: Injection-Resilient LLM Assistants for Cybersecurity Operations", 4 "authors": [ 5 "Mohammed Himayath Ali", 6 "Mohammed Aqib Abdullah", 7 "Mohammed Mudassir Uddin", 8 "Shahnawaz Alam" 9 ], 10 "year": 2026, 11 "venue": "arXiv.org", 12 "arxiv_id": "2601.07835", 13 "doi": "10.48550/arXiv.2601.07835" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "SecureCAI extends Constitutional AI with security-specific principles, DPO-based unlearning, and adaptive constitution evolution for defending LLM-assisted cybersecurity operations against prompt injection. The paper reports a 94.7% reduction in attack success rates (from 80.4% to 4.3%) across six attack categories while claiming 95.1% clean accuracy on security analysis tasks. Ablation studies attribute the largest impact to security-specific constitutional principles (ASR increases 4.4× without them). Generalization to four held-out attack types yields 7.9% average ASR compared to 50.5% for standard CAI.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No repository URL, code archive, or supplementary materials link is provided anywhere in the paper. The framework is described conceptually but no implementation is released." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The adversarial dataset (51,750 samples from SIEM, PhishTank, VirusTotal, HarmBench, synthetic, BadPrompt per Table 1) is not released. No download links or data archives are provided." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, dependency list, or environment setup details are provided. The base model is described only as '7B params' without naming the specific model or framework." 
35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The training pipeline (Section 5.2, Figure 3) describes four stages at a conceptual level but lacks sufficient operational detail to reproduce." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 2, 3, 4, and 5 are reported as point estimates without confidence intervals or error bars. No uncertainty quantification is present." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims SecureCAI outperforms all baselines (e.g., 4.3% vs 36.0% ASR) but provides no statistical significance tests — no p-values, t-tests, or bootstrap comparisons. Differences are presented as self-evident from comparing numbers." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Tables 2 and 4 include '∆Base' and '∆ASR' columns showing relative changes (e.g., -94.7%, +340%). Table 3 provides absolute accuracy figures with full context for comparison. The reader can assess magnitude." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The adversarial dataset totals 51,750 samples (Table 1) and clean evaluation uses 15,000 benign samples (Table 3), but no justification is given for why these sizes were chosen. No power analysis is provided." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No standard deviations, variance, or spread measures are reported across any tables. There is no mention of multiple experimental runs. All results appear to be single-run point estimates."
67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 compares against four baselines: Base LLM, Standard CAI [2], Input Filtering [7], and Instruction Hierarchy [11]. Table 3 repeats these comparisons for clean accuracy." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include Instruction Hierarchy (Wallace et al., 2024) and Input Filtering (Jain et al., 2023), which are recent and relevant defenses for prompt injection. Standard CAI (Bai et al., 2022) is the foundational method being extended." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 4 presents a systematic ablation study removing individual components: adaptive evolution, DPO training, unlearning, security principles, input sanitization, and reducing to Constitutional SL only. Each configuration is evaluated on ASR and CA." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper reports ASR (Table 2), Clean Accuracy with three sub-metrics (Log F1, Phishing Accuracy, Malware Human Eval in Table 3), and Constitutional Adherence Score (Figure 4)." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "Table 3 lists 'Malw.' as 'Human Eval' in the caption, but the human evaluation protocol is never described — no details on number of evaluators, evaluation criteria, inter-rater agreement, or evaluation procedure. The claim of human evaluation cannot be verified." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "Table 2 evaluates on '51,750 adversarial samples' and the training preference pairs also number 51,750 (Table 1, Figure 2). The paper does not describe any train/test split or confirm that evaluation samples are separate from training samples. 
Table 5 tests on held-out attack types (5,000 each) but this is a separate generalization experiment, not the main evaluation." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Tables 2, 3, and 5 all provide per-category breakdowns: six attack categories for ASR, three task categories for clean accuracy, and four unseen attack types for generalization." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": false, 108 "justification": "Section 7.3 discusses limitation categories at a high level (computational overhead, novel attack vulnerability, constitution design dependency) but shows no specific failure examples. No qualitative analysis of cases where SecureCAI was breached or misclassified is provided." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": false, 113 "justification": "Every experiment shows SecureCAI performing well. The ablation study (Table 4) shows expected degradation when components are removed but reveals no surprises. No configurations that were tried and abandoned are discussed." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims 94.7% ASR reduction (supported by Table 2: 80.4%→4.3%), 95.1% accuracy (Table 3), and CAS >0.92 (Figure 4). All are directly supported by reported results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims via ablation (Table 4): 'removal of security-specific constitutional principles has the largest impact, increasing ASR to 18.9%.' The ablation design is adequate — controlled single-variable removal across six configurations." 
126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims 'Injection-Resilient LLM Assistants for Cybersecurity Operations' broadly, but experiments use a single unnamed 7B parameter model. No other model sizes or architectures are tested. Section 7 does not bound claims to the specific model or scale tested." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper presents its results as straightforward evidence that SecureCAI works. No alternative explanations are considered — e.g., whether the improvements stem from training data overlap, whether the baselines were implemented at full strength, or whether the adversarial datasets are realistic." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper frames ASR on synthetic adversarial datasets as evidence of resilience in 'operational cybersecurity workflows' and 'Security Operations Centers.' The gap between controlled lab evaluation and real-world SOC deployment is not acknowledged." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "The base model is identified only as 'Pre-trained LLM 7B params' in Figure 3. No model name, family, version, or checkpoint is specified. This makes the work impossible to replicate." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "The five constitutional principles (Section 4.2.1) are described in natural language, but no actual prompt text, system instructions, or critique-revision templates are provided. The reader cannot reconstruct what was sent to the model." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "DPO β=0.1 is stated in Figure 2 and α=0.01 in Figure 3. 
However, learning rate, batch size, number of epochs, temperature, optimizer, and other standard training hyperparameters are not reported." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "The paper does not use agentic scaffolding. SecureCAI is a training/alignment framework, not an agent with tools or multi-step reasoning loops." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": false, 167 "justification": "Section 5.1 names data sources (SIEM, PhishTank, VirusTotal, HarmBench) and gives sample counts per category, but does not describe how raw data was cleaned, filtered, or transformed. The adversarial injection process is described only as 'systematic injection of attack payloads' without operational detail." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 7.3 'Limitations and Failure Modes' provides a dedicated subsection discussing three categories: computational overhead, novel attack vulnerability, and constitution design dependency." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 7.3 discusses specific threats: 23% inference latency overhead for time-critical applications, 7.9% residual ASR on held-out attacks indicating vulnerability to novel paradigms, and dependence on expert-designed principles that may have gaps. These are specific to this study." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the specific 7B model, the particular attack dataset, or the three security tasks tested. Section 8 mentions future work but does not delineate what current results cannot demonstrate." 
185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw data is released. The 51,750 adversarial samples, 18,420 critique-revision pairs, and 15,000 benign samples are not available for independent verification." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 5.1 describes data sources: SIEM logs for authentication/network/application events, PhishTank for phishing emails, malware repositories for samples, and HarmBench for direct prompt attacks. Table 1 provides counts per category with variant numbers." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": false, 201 "justification": "Section 5.3.3 mentions 'human red-team exercises with security domain experts' and Section 5.1.3 mentions 'human review for a random subset,' but neither describes how these evaluators/experts were recruited, their qualifications, or potential selection bias." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "The paper describes data sources (Section 5.1.1) and final counts (Table 1) but the transformation pipeline between raw artifacts and training-ready preference pairs is not documented with filtering criteria or intermediate counts." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No acknowledgments section, funding statement, or grant numbers appear anywhere in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "Authors list 'Computer Science Department, Cybersecurity and Artificial Intelligence Division' without naming the institution. All four use personal gmail addresses. The actual institutional affiliation is unidentifiable." 
219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial disclosure statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper tests a defense framework against adversarial attacks rather than evaluating a pre-trained model's knowledge on a benchmark. Contamination in the traditional sense (model memorized benchmark answers) does not apply." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper tests defenses rather than model knowledge. Traditional benchmark contamination criteria do not apply, though the apparent train/test overlap (both 51,750 samples) is flagged as a red flag." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "The evaluation tests defense resilience against attacks, not pre-trained model capability on knowledge benchmarks. Contamination criteria are structurally inapplicable." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants as research subjects. Human red-teamers and evaluators are part of the method/evaluation pipeline but are not studied as subjects." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human subjects study. Red-teamers and evaluators participated in a technical evaluation capacity, not as research subjects." 
258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants as research subjects." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants as research subjects." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants as research subjects." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants as research subjects." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants as research subjects." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "Section 7.3 mentions '23% inference latency increase' and Figure 2 notes '+3.2%' for input sanitization, but no absolute latency figures (ms), API costs, or tokens-per-query numbers are provided. Only relative overhead percentages are given." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No GPU hours, training time, hardware specifications, or total compute budget are reported for the four-stage training pipeline." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single run." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs is never stated. Results are presented without any indication of how many times experiments were repeated." 
307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "β=0.1 and α=0.01 are stated but no hyperparameter search process, number of configurations tried, or search method is described. The chosen values appear without justification." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": false, 316 "justification": "The hyperparameter values (β=0.1, α=0.01) are presented without explanation of how they were selected — no validation set results, no sensitivity analysis, no search procedure." 317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "No statistical significance tests are performed at all, so multiple comparison correction is moot." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement both SecureCAI and all baselines (Standard CAI, Input Filtering, Instruction Hierarchy) without acknowledging self-comparison bias. Baseline implementations may systematically underperform." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "SecureCAI adds multiple defense layers (input sanitization, constitutional enforcement, guardrails, output validation) with 23% latency overhead, but performance is not compared against baselines at matched compute budgets." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper does not discuss whether synthetic adversarial datasets (including synthetic context overflow attacks and BadPrompt-derived triggers) accurately represent real-world attack scenarios in SOC environments. The gap between lab adversarial datasets and operational threats is not examined." 
337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No agentic scaffolding is involved in the evaluation." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of temporal relationships between training data and evaluation data. The adversarial datasets were presumably generated for this study, but the paper does not clarify temporal ordering." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The preference pairs (51,750) are constructed from the same attack categories and potentially the same samples as the test set. No discussion of whether the evaluation setup leaks information from training." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "Training preference pairs number 51,750 (Table 1, Figure 2) and the evaluation also uses 51,750 adversarial samples (Table 2). The paper does not state whether these are independent sets or the same data, which is a critical omission." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method is applied. No train/test split verification, decontamination, or independence checks are described." 364 } 365 } 366 }, 367 "engagement_factors": { 368 "practical_relevance": { 369 "score": 2, 370 "justification": "Addresses a real SOC deployment scenario for defending LLM assistants against prompt injection, relevant to security practitioners." 371 }, 372 "surprise_contrarian": { 373 "score": 0, 374 "justification": "Extends CAI with security-specific principles — a natural combination that confirms expectations rather than challenging them." 
375 }, 376 "fear_safety": { 377 "score": 2, 378 "justification": "Demonstrates high baseline attack success rates (80.4%) on unprotected LLM security assistants, raising concerns about current deployments." 379 }, 380 "drama_conflict": { 381 "score": 0, 382 "justification": "No controversy, no challenge to specific vendors or established claims." 383 }, 384 "demo_ability": { 385 "score": 0, 386 "justification": "No code, demo, or tool released. The base model is not even named." 387 }, 388 "brand_recognition": { 389 "score": 0, 390 "justification": "Unknown authors with personal gmail addresses and unidentifiable institutional affiliation." 391 } 392 }, 393 "claims": [ 394 { 395 "claim": "SecureCAI reduces attack success rates by 94.7% compared to baseline models (from 80.4% to 4.3% average ASR).", 396 "evidence": "Table 2 shows ASR across six attack categories for five methods. SecureCAI achieves 4.3% average vs 80.4% for base LLM. Per-category ASR ranges from 3.2% to 5.1%.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "SecureCAI maintains 95.1% average clean accuracy on benign security analysis tasks, improving over the base LLM (91.5%).", 401 "evidence": "Table 3 reports clean accuracy: Log F1 96.8%, Phishing Acc 95.4%, Malware Human Eval 93.2%, averaging 95.1% vs 91.5% for the base LLM.", 402 "supported": "weak" 403 }, 404 { 405 "claim": "Constitutional adherence scores exceed 0.92 under maximum adversarial pressure.", 406 "evidence": "Figure 4 shows CAS curves under increasing adversarial pressure. 
SecureCAI maintains CAS >0.92 across all pressure levels with only 6.1% degradation from minimum to maximum.", 407 "supported": "weak" 408 }, 409 { 410 "claim": "Security-specific constitutional principles are the most impactful component, with removal increasing ASR by 340%.", 411 "evidence": "Table 4 ablation: removing security principles increases ASR from 4.3% to 18.9% (∆ASR +340%), the largest impact of any single component removal.", 412 "supported": "moderate" 413 }, 414 { 415 "claim": "SecureCAI generalizes to unseen attack types with 7.9% average ASR vs 50.5% for standard CAI.", 416 "evidence": "Table 5 tests four held-out attack categories (multi-turn, encoding obfuscation, semantic camouflage, role-play) each with 5,000 samples. SecureCAI averages 7.9% ASR.", 417 "supported": "moderate" 418 } 419 ], 420 "red_flags": [ 421 { 422 "flag": "Probable train/test data overlap", 423 "detail": "The training set consists of 51,750 preference pairs (Table 1) and the main evaluation in Table 2 also uses exactly 51,750 adversarial samples. The paper never explicitly states these are separate sets, raising serious concerns about evaluating on training data." 424 }, 425 { 426 "flag": "Unidentified base model", 427 "detail": "The base model is described only as 'Pre-trained LLM 7B params' (Figure 3). No model name, family, or checkpoint is provided. This makes the work completely irreproducible and prevents assessment of whether results transfer to other models." 428 }, 429 { 430 "flag": "Clean accuracy improvement is counterintuitive", 431 "detail": "SecureCAI achieves 95.1% clean accuracy vs 91.5% for the base LLM (Table 3), meaning the safety framework not only doesn't degrade but actively improves task performance. This is unusual — safety training typically involves some accuracy tradeoff — and is not explained." 
432 }, 433 { 434 "flag": "No variance or error bars on any results", 435 "detail": "All tables report single point estimates with no standard deviations, confidence intervals, or information about multiple runs. The stability of the claimed 4.3% ASR is unknown." 436 }, 437 { 438 "flag": "Suspicious institutional affiliation", 439 "detail": "All four authors use personal gmail addresses. The affiliation 'Computer Science Department, Cybersecurity and Artificial Intelligence Division' names no institution. This makes it impossible to assess institutional credibility or potential conflicts." 440 }, 441 { 442 "flag": "Baseline implementation fairness unverifiable", 443 "detail": "All baselines (Standard CAI, Input Filtering, Instruction Hierarchy) are re-implemented by the authors without code release. There is no way to verify these implementations are faithful and competitive, and no acknowledgment of self-comparison bias." 444 }, 445 { 446 "flag": "Adversarial pressure metric undefined", 447 "detail": "Figure 4 plots CAS against 'adversarial pressure level' ranging 0.0–1.0, described only as 'normalized measure of attack sophistication and intensity.' The operationalization of this metric is never defined, making the figure uninterpretable." 448 } 449 ], 450 "cited_papers": [ 451 { 452 "title": "Constitutional AI: Harmlessness from AI feedback", 453 "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"], 454 "year": 2022, 455 "arxiv_id": "2212.08073", 456 "relevance": "Foundational method for training harmless AI through self-improvement and constitutional principles, which SecureCAI extends." 457 }, 458 { 459 "title": "BadPrompt: Backdoor attacks on continuous prompts", 460 "authors": ["Xiangrui Cai", "Haidong Xu", "Sihan Xu"], 461 "year": 2022, 462 "relevance": "Demonstrates backdoor attacks on prompt-based models via adaptive trigger optimization, directly relevant to LLM security." 
463 }, 464 { 465 "title": "Jailbreaking black box large language models in twenty queries", 466 "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban"], 467 "year": 2023, 468 "arxiv_id": "2310.08419", 469 "relevance": "Black-box jailbreak methodology relevant to evaluating LLM safety and adversarial robustness." 470 }, 471 { 472 "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned", 473 "authors": ["Deep Ganguli", "Liane Lovitt", "Jackson Kernion"], 474 "year": 2022, 475 "arxiv_id": "2209.07858", 476 "relevance": "Establishes red-teaming methodology for language model safety evaluation, used as reference for adversarial testing." 477 }, 478 { 479 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 480 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"], 481 "year": 2023, 482 "relevance": "Defines indirect prompt injection taxonomy for LLM-integrated applications, foundational to SecureCAI's threat model." 483 }, 484 { 485 "title": "Baseline defenses for adversarial attacks against aligned language models", 486 "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen"], 487 "year": 2023, 488 "arxiv_id": "2309.00614", 489 "relevance": "Evaluates baseline defense methods including perplexity-based filtering against adversarial attacks on LLMs." 490 }, 491 { 492 "title": "Prompt injection attack against LLM-integrated applications", 493 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li"], 494 "year": 2023, 495 "arxiv_id": "2306.05499", 496 "relevance": "Characterizes prompt injection attacks targeting LLM-integrated applications, directly relevant to the attack surface analysis."
497 }, 498 { 499 "title": "Ignore previous prompt: Attack techniques for language models", 500 "authors": ["Fábio Perez", "Ian Ribeiro"], 501 "year": 2022, 502 "arxiv_id": "2211.09527", 503 "relevance": "Early taxonomy of prompt injection attack techniques for language models." 504 }, 505 { 506 "title": "Direct preference optimization: Your language model is secretly a reward model", 507 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"], 508 "year": 2023, 509 "relevance": "Core training methodology adapted by SecureCAI for security-specific preference learning and unlearning." 510 }, 511 { 512 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 513 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike"], 514 "year": 2024, 515 "arxiv_id": "2404.13208", 516 "relevance": "Proposes instruction hierarchy for LLM safety, used as a baseline defense in SecureCAI evaluation." 517 }, 518 { 519 "title": "Jailbroken: How does LLM safety training fail?", 520 "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"], 521 "year": 2023, 522 "relevance": "Analyzes failure modes of LLM safety training, relevant to understanding why standard defenses are insufficient." 523 }, 524 { 525 "title": "Universal and transferable adversarial attacks on aligned language models", 526 "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter"], 527 "year": 2023, 528 "arxiv_id": "2307.15043", 529 "relevance": "Demonstrates universal adversarial attacks via gradient-based optimization on aligned LLMs." 530 } 531 ] 532 }