ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27993B)


      1 {
      2   "paper": {
      3     "title": "SPIN: Self-Supervised Prompt INjection",
      4     "authors": ["Leon Zhou", "Junfeng Yang", "Chengzhi Mao"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2410.13236",
      8     "doi": "10.48550/arXiv.2410.13236"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SPIN proposes an inference-time defense against LLM jailbreak attacks using self-supervised tasks (repeat, interjection) for detection and perplexity-based token optimization for reversal. On AdvBench with GCG attacks, SPIN reduces ASR to 12.11% on Vicuna-7b and 0% on Llama-2-7b. The method also defends against natural language jailbreaks, adversarial instructions, and role-play attacks, reducing ASR to 0% on both models. However, all results are single-run without variance reporting, tested on only two 7B models from the same family, and thresholds appear to be selected on evaluation data.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or GitHub link is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: AdvBench 'harmful behaviors' (Zou et al., 2023) and TriviaQA (Joshi et al., 2017). No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or library version information is provided. The paper mentions using Llama-2 and Vicuna but provides no environment setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Key implementation details like the exact GCG optimization setup, token substitution procedure, and threshold calibration process would need to be reverse-engineered from the paper text."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All ASR results in Figures 5-7 and Table 1 are reported as point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes many comparative claims (e.g., 'reduces ASR by up to 87.9%', SPIN vs. baselines) without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports ASR reductions with baseline context (e.g., Vicuna GCG goes from 100% no-defense to 12.11% with SPIN; Llama-2 from ~30% to 0%). The magnitude of improvement is clear from the bar charts and reported numbers."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why 520 AdvBench examples, 150 samples per natural language attack, or the top 5 jailbreak prompts were chosen. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. All ASR numbers appear to be single-run results."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against Perplexity filter (Jain et al., 2023) and RA-LLM (Cao et al., 2023) as defense baselines, plus a 'No Defense' condition (Figure 5)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Both baselines (Perplexity filter and RA-LLM) are from 2023, which is recent and relevant for a 2024 paper on LLM defense."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Figure 7 presents an ablation study showing ASR for individual defense layers (Repeat, Interjection, Reversal) and their combinations (Repeat+Reversal, Interjection+Reversal, Full Defense). The right panel also shows reversal performance across different optimization steps."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports ASR (attack success rate) as the primary metric, TriviaQA accuracy as a benign performance metric, latency in seconds (Table 1), and ROC/AUC for detection (Figure 4)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated (keyword-matching for ASR, automated TriviaQA scoring). Human judges could have assessed whether responses flagged as harmful were truly harmful or whether defenses degraded response quality on benign inputs."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The detection thresholds (0.89 for repeat, 5.73/6.55 for interjection) appear to be selected from ROC curves computed on the same AdvBench data used for evaluation (Figure 4). No separate validation/test split is described."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by attack type: Jailbreak, GCG, Adversarial Instructions, Roleplay, ICA, AutoDAN, and CodeChameleon (Figure 5), as well as by defense layer combination (Figure 7)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses why jailbreak prompts are harder for reversal (coherent sentence structure, same perplexity as benign inputs, Section 4.4) and acknowledges that adaptive attackers can partially bypass individual layers (Figure 6)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper shows that reversal alone is less effective against jailbreaks than against GCG attacks (Figure 7 left), and that adaptive attackers can reduce defense effectiveness from 87.9% to 76% (Section 4.5)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'reduce ASR by up to 87.9%' which is supported by Vicuna GCG results (100% → 12.11%). 'Maintaining performance on benign user requests' is shown via TriviaQA results. 'Resilient against adaptive attackers' is shown in Figure 6 (76% reduction under adaptive attack)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('our method reduces ASR'). The ablation study (Figure 7) provides controlled single-variable manipulation by testing each defense layer independently and in combination, which is adequate for these causal claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims the method is 'model agnostic' (Section 1) but tests only two 7B models from the same family (Llama-2-chat and Vicuna-7b, which is fine-tuned from Llama-2). No testing on different model families, sizes, or closed-source models despite claiming transferability."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why the defense works. For instance, the repeat task's effectiveness could be due to the safety alignment itself (refusing to repeat harmful content) rather than the self-supervised signal detecting attack structure. No robustness checks against this alternative interpretation."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses keyword-matching ASR (checking if responses begin with common alignment refusals) as a proxy for actual defense effectiveness. A response that doesn't start with a refusal is counted as a successful attack, but this doesn't verify whether the response is actually harmful. TriviaQA is used as a proxy for 'benign user request performance' generally, but is a single narrow task. Neither proxy gap is acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper specifies 'Llama 2-chat' (Section 4.2) and 'Vicuna-7b' with sufficient detail — both are open-source models with well-defined weights. Vicuna is noted as 'based on finetuning Llama 2 with conversations from ShareGPT.'"
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The actual defense prompts are provided: 'Repeat only the following sentences exactly without changes: ' for the repeat task (Section 4.4) and 'But first, answer only what is the capital of France?' for the interjection task. The initial reversal tokens '! ! ! ! !' are also specified."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key hyperparameters are reported: GCG 500 iterations, 5 defense tokens, 25 reversal optimization steps, batch size 50, top-256 token candidates, detection thresholds of 0.89 (repeat) and 5.73/6.55 (interjection), check every 5 iterations during reversal."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The system is a straightforward inference-time pipeline (detect → reverse → generate)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper describes how evaluation data was constructed: 520 AdvBench requests with GCG suffixes optimized for up to 500 iterations, top 5 jailbreak prompts each paired with 30 random AdvBench requests (=150), adversarial instructions paired with 150 samples, and similar for role-play attacks (Section 4.1, 4.3)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The conclusion (Section 5) is brief and does not discuss limitations. The computational overhead discussion in Section 4.6 touches on a practical limitation but is framed as an ablation, not a limitation."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The paper does not address concerns like threshold generalization to unseen attack types, dependency on the specific models tested, or the keyword-matching ASR metric's validity."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do not show. There is no discussion of boundaries regarding model size, model family, language, attack sophistication, or real-world deployment considerations."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (per-example ASR results, generated responses, or detection scores) is released. Only aggregate results in figures are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data collection procedure is described: AdvBench 520 harmful behaviors with GCG-optimized suffixes, top 5 jailbreak prompts from JailbreakChat paired with random AdvBench samples, and TriviaQA wikipedia validation set (Sections 4.1, 4.3)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. All data sources are standard public benchmarks (AdvBench, TriviaQA, JailbreakChat)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: malicious requests from AdvBench → append attack suffixes (GCG-optimized or natural language) → run through SPIN defense layers (detect → reverse) → measure ASR by checking for alignment refusal prefixes."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 6 acknowledges funding: 'This work was supported in part by multiple Google Cyber NYC awards, Columbia SEAS/EVPR stimulus award, and Columbia SEAS-KFAI Generative AI and Public Discourse Research award.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Columbia University (Leon Zhou, Junfeng Yang) and Rutgers University (Chengzhi Mao). They are not evaluating their own company's product."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Google's Cyber NYC awards and Columbia internal grants fund this work. The paper evaluates open-source models (Llama-2, Vicuna), not Google products. The funders do not have a direct financial stake in the specific outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests a defense mechanism against attacks, not the model's knowledge or capability on a benchmark. Per schema, contamination items are NA for studies that test defenses rather than model knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same as training_cutoff_stated: the paper evaluates a defense system, not model knowledge on a benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as training_cutoff_stated: the paper evaluates a defense system, not model knowledge on a benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 1 reports latency for each defense component: standard inference 2.80s, repeat 0.45s, interjection 0.89s, reversal 12.62s, full defense 16.31s, averaged across 100 samples."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget (GPU hours, hardware specifications, total experiment time) is stated. Only per-inference latency is reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity analysis. The GCG optimization and token selection involve randomness (uniform picking from top-256 tokens) but no multi-seed results are reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of independent experimental runs is not stated. Latency is averaged over 100 samples (Table 1), but ASR results do not indicate how many independent runs produced them."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Detection thresholds are derived from ROC curves, but no hyperparameter search budget is reported for other parameters (number of defense tokens, optimization steps, batch size, top-k candidates)."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The detection thresholds (0.89 for repeat, 5.73/6.55 for interjection) are selected as 'best thresholds' from ROC curves (Figure 4) computed on the same data used for evaluation. No held-out validation set is described for threshold selection."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper compares across 7 attack types, 2 models, and multiple defense configurations without any correction for multiple comparisons. No statistical tests are performed at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own defense system and implement the baseline defenses themselves. No discussion of potential bias from author-implemented baselines or author-evaluated systems."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 7 (right) shows ASR as a function of reversal iterations (0-25 steps) for both models, and Table 1 reports the latency cost of each defense component. The trade-off between computation and defense performance is explicitly discussed in Section 4.6."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether keyword-matching ASR on AdvBench actually measures real-world defense effectiveness. The paper does not question whether AdvBench harmful behaviors are representative of real attacks or whether the ASR metric captures meaningful safety."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is involved in this work."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "AdvBench was published in 2023 and Llama-2's training likely includes data from the same period. The paper does not discuss whether the models' safety training may have included AdvBench examples, which could inflate defense effectiveness metrics."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. For example, the keyword-matching ASR metric (checking for refusal prefixes) is a feature of the model's safety training rather than a ground-truth harmfulness assessment."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper pairs the same 150 AdvBench samples with different attack types without discussing potential non-independence between these overlapping evaluations."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SPIN can reduce the attack success rate by up to 87.9% while maintaining performance on benign user requests",
    365       "evidence": "Figure 5 shows Vicuna GCG ASR drops from 100% (no defense) to 12.11% (SPIN). TriviaQA performance is maintained across defense conditions. Section 4.4.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "SPIN reduces ASR to 0% on Llama-2 for GCG attacks, natural language jailbreaks, adversarial instructions, and role-play attacks",
    370       "evidence": "Figure 5 (right panel) shows 0% ASR on Llama-2-7b for these attack types with full SPIN defense. However, these are single-run results without variance reporting.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "The method is resilient against adaptive attackers who are aware of the defense",
    375       "evidence": "Figure 6 shows that adaptive attacks can partially bypass individual layers but the multi-layer defense still reduces ASR by up to 76% under adaptive attack (Section 4.5).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The method is model agnostic and compatible with existing defense systems",
    380       "evidence": "Tested on only two 7B models from the same family (Llama-2-chat and Vicuna-7b). No testing on different architectures, sizes, or closed-source models.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "Self-supervised tasks can detect jailbreak attacks because adversarial prompts degrade other model capabilities",
    385       "evidence": "Figure 4 shows clear separation in repeat-task loss (AUC=0.99) and interjection-task loss (AUC=0.85) between benign and malicious inputs. Section 3.2 explains the theoretical motivation.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Threshold selection on evaluation data",
    392       "detail": "Detection thresholds (0.89 for repeat, 5.73/6.55 for interjection) are derived from ROC curves computed on the same AdvBench data used for evaluation (Figure 4). No held-out validation split is described, risking overfitting the thresholds to the evaluation distribution."
    393     },
    394     {
    395       "flag": "No error bars or variance across runs",
    396       "detail": "All ASR results are single-run point estimates. The GCG optimization and token selection involve stochasticity, but no multi-seed or multi-run results are reported, making it impossible to assess result stability."
    397     },
    398     {
    399       "flag": "Limited model diversity for model-agnostic claims",
    400       "detail": "The paper claims 'model agnostic' but tests only two 7B models from the same Llama-2 family. Vicuna-7b is fine-tuned from Llama-2. No testing on different architectures (e.g., GPT, Mistral), sizes, or closed-source models."
    401     },
    402     {
    403       "flag": "Keyword-matching ASR metric",
    404       "detail": "ASR is measured by checking whether the response matches 'common denial prefixes' rather than whether the content is actually harmful. This proxy could both over-count (non-refusal responses that are not actually harmful are still scored as successful attacks) and under-count (responses that open with a refusal prefix but go on to provide harmful content are scored as defended) real attacks."
    405     },
    406     {
    407       "flag": "No limitations section",
    408       "detail": "The paper has no dedicated limitations or threats-to-validity discussion. Practical deployment concerns (full-defense latency of 16.31s versus 2.80s for standard inference, roughly 5.8x), generalization to unseen attack types, and threshold stability are not addressed as limitations."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Universal and transferable adversarial attacks on aligned language models",
    414       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    415       "year": 2023,
    416       "relevance": "Foundational GCG attack method used as the primary attack baseline in SPIN's evaluation."
    417     },
    418     {
    419       "title": "Jailbroken: How does LLM safety training fail?",
    420       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    421       "year": 2023,
    422       "relevance": "Analyzes failure modes of LLM safety alignment (conflicting objectives, mismatched generalization) that SPIN aims to defend against."
    423     },
    424     {
    425       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    426       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    427       "year": 2024,
    428       "arxiv_id": "2310.04451",
    429       "relevance": "Automated jailbreak generation method used as one of SPIN's evaluation attack types."
    430     },
    431     {
    432       "title": "CodeChameleon: Personalized encryption framework for jailbreaking large language models",
    433       "authors": ["Huijie Lv", "Xiao Wang", "Yuansen Zhang"],
    434       "year": 2024,
    435       "arxiv_id": "2402.16717",
    436       "relevance": "Encryption-based jailbreak method used as an evaluation attack type."
    437     },
    438     {
    439       "title": "Llama 2: Open foundation and fine-tuned chat models",
    440       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    441       "year": 2023,
    442       "relevance": "One of the two target models evaluated in SPIN's defense experiments."
    443     },
    444     {
    445       "title": "Training language models to follow instructions with human feedback",
    446       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    447       "year": 2022,
    448       "relevance": "Foundational RLHF method for LLM safety alignment that SPIN's defense complements."
    449     },
    450     {
    451       "title": "Certifying LLM safety against adversarial prompting",
    452       "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas"],
    453       "year": 2023,
    454       "relevance": "Related defense work that relies on external LLM verification, contrasted with SPIN's self-supervised approach."
    455     },
    456     {
    457       "title": "LLM self defense: By self examination, LLMs know they are being tricked",
    458       "authors": ["Mansi Phute", "Alec Helbling", "Matthew Hull"],
    459       "year": 2023,
    460       "relevance": "Related self-examination defense approach for LLM safety."
    461     },
    462     {
    463       "title": "Code Llama: Open foundation models for code",
    464       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    465       "year": 2023,
    466       "relevance": "LLM for code generation, cited in context of LLM capabilities that need safety protection."
    467     },
    468     {
    469       "title": "Direct preference optimization: Your language model is secretly a reward model",
    470       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    471       "year": 2023,
    472       "relevance": "Training-time alignment method (DPO) that SPIN's inference-time defense complements."
    473     },
    474     {
    475       "title": "Jailbreak and guard aligned language models with only few in-context demonstrations",
    476       "authors": ["Zeming Wei", "Yifei Wang", "Ang Li"],
    477       "year": 2024,
    478       "arxiv_id": "2310.06387",
    479       "relevance": "In-context demonstration attack (ICA) used as an evaluation attack type in SPIN."
    480     },
    481     {
    482       "title": "\"Do Anything Now\": Characterizing and evaluating in-the-wild jailbreak prompts on large language models",
    483       "authors": ["Xinyue Shen", "Zeyuan Chen", "Michael Backes"],
    484       "year": 2024,
    485       "relevance": "Characterizes DAN-style role-play jailbreak attacks that SPIN defends against."
    486     }
    487   ]
    488 }

Impressum · Datenschutz