ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30897B)


      1 {
      2   "paper": {
      3     "title": "Single Character Perturbations Break LLM Alignment",
      4     "authors": ["Leon Lin", "Hannah Brown", "Kenji Kawaguchi", "Michael Shieh"],
      5     "year": 2024,
      6     "venue": "AAAI Conference on Artificial Intelligence",
      7     "arxiv_id": "2407.03232",
      8     "doi": "10.48550/arXiv.2407.03232"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "observational"],
     13   "key_findings": "Appending a single space character to the end of LLM conversation templates reliably bypasses safety alignment in 6 of 8 tested open-source models at roughly the 7B scale (6B-8B), achieving 100% attack success rate on Vicuna-7B and Guanaco-7B. The vulnerability stems from how subword tokenization causes single space tokens to appear predominantly before numbers in pre-training data, biasing models to generate lists rather than refusals. Llama-2 and Llama-3 are resistant, likely due to fine-tuning that teaches robustness to this perturbation. Fine-tuning on space-prepended data reduces Vicuna's ASR from 99% to 23%.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Code and data released at https://github.com/hannah-aught/space_attack, referenced in Section 3.1 footnote 3 and Appendix H."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "They use the publicly available AdvBench dataset (Zou et al., 2023) and C4 (Raffel et al., 2020), and state 'All code and data is available at' their GitHub repository (Appendix H)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Hardware is mentioned (single A100 40G GPU, Appendix H) but no requirements.txt, Dockerfile, or detailed library versions are provided in the paper. HuggingFace model links are given but software dependency specifications are absent."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. Methods are described in sufficient detail to understand the approach, and code is linked, but there is no 'Reproducing Results' section or explicit commands to run."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All ASR values in Tables 1, 5, 6, 7 are reported as point estimates (percentages) without confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are reported. Differences between models and tokens are compared by raw ASR percentages without any formal testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Table 1 reports both ASR and Base Rate for each model, providing context for effect magnitude (e.g., Vicuna-7B: 100% ASR vs 3% base rate, ChatGLM-6B: 62% vs 8%). Similar baseline context provided across all comparisons."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The study uses a 100-sample subset of AdvBench with no justification for why 100 samples is sufficient. No power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance or standard deviation is reported. While greedy decoding ensures deterministic results per input, no uncertainty estimates (e.g., bootstrap confidence intervals on ASR) are provided across the 100-sample test set."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Base rates (no perturbation) are reported alongside attack ASRs in Table 1. The GCG algorithm (Zou et al., 2023) is compared as an alternative attack method in Section 5."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The GCG adversarial suffix attack (Zou et al., 2023) was a contemporary state-of-the-art attack method. Models tested include recent releases (Llama-3, Mistral)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablations: different tokens (Table 5, Section 4), different insertion positions (Appendix C.2, Table 8), template mismatches (Appendix C.1), separator modifications (Table 9), and fine-tuning with space-prepended data (Section 9)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The primary evaluation uses only ASR (Attack Success Rate) based on absence of refusal keywords. Top-k overlap (Figure 4) and manual verification (74/80) are supplementary analyses, not alternative evaluation metrics for the main claims."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The first authors manually evaluated a random sample of 80 outputs (10 per model) to verify the accuracy of the automated refusal-word evaluation method, finding 74/80 agreement (Section 3.3)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The main attack (appending space) was discovered accidentally and not optimized on the AdvBench test set. The 100-sample AdvBench subset serves as a clean test set for the primary results. GCG search in Section 5 operates on the same data but is a secondary analysis."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per model (Table 1), per model size (7B vs 13B), per token type (Tables 5-7, Figure 3), per insertion position (Table 8), and per template pair (Figure 7)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Failure cases discussed: Llama-2 and Llama-3 resistance (Section 9), unusual outputs including language switching and gibberish (Appendix F.2, Table 12), and cases where non-refusal does not equal harmful output (6/80 manual check, Section 13)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results: attack fails on Llama-2 and Llama-3 (Table 1), GCG does not converge with suffix length 1 (Section 5), fine-tuning defense is imperfect ('still susceptible to attack by other punctuation tokens,' Section 9), and other insertion positions are ineffective (Table 8)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims that appending space causes 'the majority of models to generate harmful outputs with very high success rates' are supported by Table 1 showing 6/8 models affected. The abstract specifies 'eight open-source models' matching the study scope."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims tokenization contexts 'encourage models to generate lists when prompted, overriding training signals' (Section 1). While correlational evidence from pre-training data (Sections 7-8) and the LoRA experiment (Section 9) provide support, the full causal chain from tokenization context to alignment bypass is hypothesized rather than rigorously established. The theoretical analysis in Appendix E shows it is possible but does not demonstrate it is the actual mechanism."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title 'Single Character Perturbations Break LLM Alignment' implies broad applicability beyond what was tested: 8 open-source models in the 6B-13B range, English only, grey-box access only. While Section 13 acknowledges English-only and open-source-only scope, the title and abstract framing remain broader than the evidence supports."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper focuses on the tokenization-context hypothesis without substantively considering alternatives. Other possible explanations — such as RLHF training distribution effects, attention mechanism disruption (only a theoretical sketch in Appendix E), or fine-tuning data format sensitivities — are not explored as competing hypotheses."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly distinguishes between its proxy (non-refusal, measured by absence of refusal keywords) and the intended outcome (harmful output). Section 3.3 acknowledges 'it is likely that model responses that do not contain a refusal to a harmful query are harmful.' Section 13 discusses false positives. Manual verification of 80 samples (74/80 accuracy) validates the proxy."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact HuggingFace model identifiers are provided in Table 13 (Appendix H), e.g., 'lmsys/vicuna-7b-v1.5', 'meta-llama/Llama-2-7b-chat-hf', 'mistralai/Mistral-7B-Instruct-v0.2', with links to model pages."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full chat templates for all models are provided in Table 15 (Appendix I). The AdvBench prompts are from a public dataset, and the perturbation method (appending characters) is fully specified."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Greedy decoding stated for all experiments (Appendix H). GCG run for 100 steps with suffix length 1 (Section 5). LoRA fine-tuning uses 1,000 LIMA samples for 10 epochs (Section 9)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper tests direct model inference with modified chat templates."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data selection is documented: 100-sample subset of AdvBench harmful behaviors split (Section 3.1), 10,000 C4 samples for tokenization analysis (Section 7), 1,000 LIMA instructions for fine-tuning (Section 9). Token grouping into five categories is described (Section 7)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 13 'Limitations' provides substantive discussion covering language scope, evaluation methodology limitations, and inability to test closed-source models."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: English-only testing with different expected results for other languages (Section 13), evaluation accuracy at 74/80 with specific failure modes identified, and inability to test closed-source models due to template access requirements."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 13 explicitly states: 'we only examine English Language inputs,' 'We do not claim that these results generalize beyond English,' and 'we are unable to thoroughly test these results on closed-source models.' Section 3.5 bounds the setting to grey-box access."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "AdvBench and C4 are publicly available, all models are on HuggingFace, and code/data are released at github.com/hannah-aught/space_attack (Appendix H)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data sources clearly described: 100-sample subset from AdvBench harmful behaviors split (Section 3.1), 10,000 C4 samples for pre-training analysis (Section 7), 1,000 LIMA instructions for fine-tuning (Section 9)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks and datasets."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: prompts from AdvBench inserted into model-specific chat templates, optional character appended, model inference with greedy decoding, output checked against refusal word list (Appendix G), ASR computed."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements section discloses funding from the National Research Foundation Singapore (AISG Award No: AISG2-TC-2023-010-SGIL) and the Singapore Ministry of Education Academic Research Fund Tier 1 (Award No: T1 251RES2207)."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are affiliated with the National University of Singapore, clearly stated in the paper header. They are not evaluating their own product."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding is from Singapore government research agencies (NRF, MOE) which have no financial stake in LLM alignment vulnerability findings."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the eight models evaluated, despite testing on AdvBench which could appear in fine-tuning data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether AdvBench prompts appeared in any model's training or fine-tuning data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "AdvBench was published in 2023 (Zou et al.). Models trained or fine-tuned after this may have been specifically tuned to refuse these exact prompts, potentially inflating baseline refusal rates. No contamination analysis is provided."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. All experiments involve automated model inference."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. Section 14 discusses ethical considerations of disclosing the attack but no IRB approval was needed."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 14 (Appendix H) reports wall-clock time for each experiment type (e.g., 'Measure token ASR: 20min/token', 'GCG search: 3h/suffix'). Hardware specified as a single A100 40G GPU."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table 14 provides computation time for all experiment types. Hardware specified as 'a single A100 40G GPU' (Appendix H). LoRA fine-tuning took 12h, GSM8K evaluation 8h."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "The paper uses greedy decoding for all experiments (Appendix H: 'To ensure reproducibility, we use greedy decoding for all experiments'), which eliminates stochastic variation across seeds. Results are deterministic by design."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Greedy decoding implies single deterministic runs. The paper states explicitly: 'To ensure reproducibility, we use greedy decoding for all experiments' (Appendix H)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "While the main attack has no hyperparameters, choices such as GCG's 100 steps, LoRA's 10 epochs and 1,000 samples, and the 100-sample AdvBench subset size are not justified through systematic search or budget reporting."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The main finding (space token) was discovered accidentally, not through optimization. For GCG, the most frequently selected tokens are reported rather than cherry-picking the best single result (Section 5). The punctuation token search is exhaustive over Python's string.punctuation."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed despite many comparisons across 8+ models and dozens of token types. No multiple comparison correction is applied or discussed."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper does not discuss potential bias in evaluating their own attack method or how their experimental choices (model selection, prompt selection, evaluation metric) may favor their findings."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No formal comparison of attack effectiveness as a function of compute. Their attack is computationally trivial compared to GCG (appending a space vs. 3h optimization), but this contrast is not formally presented as a compute-performance tradeoff."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 3.3 discusses what ASR measures and its limitations. Section 13 acknowledges non-refusal may not equal harmful output. Manual verification of 80 samples (74/80 = 92.5% accuracy) validates the metric's construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are tested via direct inference with chat templates."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether models' training data included AdvBench prompts or similar harmful request datasets that could bias refusal behavior."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. The relationship between fine-tuning template format and evaluation template format is explored in Appendix C.1 but not framed as a leakage concern."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the 100 AdvBench prompts are independent or share structural similarities that could bias results."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Appending a single space to the end of LLM conversation templates reliably causes models to generate harmful outputs, achieving 100% ASR on Vicuna-7B and Guanaco-7B.",
    365       "evidence": "Table 1 shows ASR for 8 models at 7B: Vicuna (100%), Guanaco (100%), Falcon (84%), ChatGLM (62%), Mistral (58%), MPT (21%), with base rates far lower. Similar results for 13B: Vicuna-13B (72%), Guanaco-13B (93%). Llama-2 and Llama-3 unaffected (0% and 3%).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Single space tokens most commonly appear before numbers in tokenized pre-training data due to subword tokenization merging common tokens.",
    370       "evidence": "Figure 5 shows that for all model tokenizers, numerical tokens are the most likely to follow a single space token in 10,000 C4 samples (>90% for most tokenizers). Section 7.1 explains the subword tokenization mechanism causing this.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "The tokenization context of space causes models to shift toward generating numbers/lists rather than refusals, bypassing alignment.",
    375       "evidence": "Figure 6 shows models shift from generating non-numerical first tokens (6a) to generating numerical tokens when space is appended (6b). Appendix B shows 27% of ShareGPT model outputs contain list formatting. Correlation between pre-training context and model predictions demonstrated, but causal mechanism is inferred rather than proven.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Llama-2 and Llama-3 are resistant to the space attack due to their fine-tuning procedure.",
    380       "evidence": "Table 1 shows 0% ASR for Llama-2-7B/13B and 3% for Llama-3-8B. Section 9 fine-tunes Vicuna on space-prepended LIMA data, reducing ASR from 99% to 23%, providing partial support. However, the exact mechanism in Llama-2/3's training is not identified.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "The attack has minimal impact on reasoning performance, primarily affecting alignment.",
    385       "evidence": "Table 10 (Appendix D) shows Mistral-7B on GSM8K: strict-match drops from 38.67% to 36.32% with space appended, a small decrease. However, this is tested on only one model and one benchmark.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No uncertainty quantification",
    392       "detail": "ASR values are reported as point estimates without confidence intervals or statistical tests despite being computed from only 100 samples. With n=100, a 58% ASR has a 95% CI of roughly ±10pp, which could change interpretation of borderline results like MPT-7B (21% ASR vs 15% base rate)."
    393     },
    394     {
    395       "flag": "Small evaluation sample",
    396       "detail": "All main experiments use only 100 prompts from AdvBench with no justification for sample size. Manual evaluation covers only 80 samples (10 per model). These small samples limit precision and generalizability."
    397     },
    398     {
    399       "flag": "Evaluation proxy systematic overestimation",
    400       "detail": "The keyword-based evaluation (absence of refusal words = harmful) has a measured 7.5% false positive rate (6/80 in manual verification). This systematic overestimation of ASR is acknowledged but not corrected in reported numbers."
    401     },
    402     {
    403       "flag": "Title overgeneralizes findings",
    404       "detail": "'Single Character Perturbations Break LLM Alignment' implies broad applicability, but results are limited to 8 open-source models (6B-13B range), English only, with 2 models (Llama-2, Llama-3) showing resistance. Closed-source models were not tested."
    405     },
    406     {
    407       "flag": "Single benchmark for general performance claim",
    408       "detail": "The claim that the attack primarily affects alignment rather than general performance rests on a single experiment with Mistral-7B on GSM8K. No other models or reasoning benchmarks were tested."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    414       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    415       "year": 2023,
    416       "relevance": "Foundational adversarial suffix attack on LLM alignment; provides the GCG algorithm and AdvBench dataset used in this study."
    417     },
    418     {
    419       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    420       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    421       "year": 2023,
    422       "arxiv_id": "2307.02483",
    423       "relevance": "Analyzes failure modes of LLM safety training, providing theoretical framework for understanding alignment vulnerabilities."
    424     },
    425     {
    426       "title": "Constitutional AI: Harmlessness from AI Feedback",
    427       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    428       "year": 2022,
    429       "relevance": "Key alignment training method that this paper shows can be bypassed by trivial template perturbations."
    430     },
    431     {
    432       "title": "Deep reinforcement learning from human preferences",
    433       "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown", "Miljan Martic", "Shane Legg", "Dario Amodei"],
    434       "year": 2017,
    435       "relevance": "Introduces RLHF, the primary alignment technique that the space attack circumvents."
    436     },
    437     {
    438       "title": "\"Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models",
    439       "authors": ["Xinyue Shen", "Zeyuan Chen", "Michael Backes", "Yun Shen", "Yang Zhang"],
    440       "year": 2023,
    441       "relevance": "Characterizes jailbreak prompt attacks on LLMs, complementary attack methodology in the adversarial alignment space."
    442     },
    443     {
    444       "title": "Llama 2: Open foundation and fine-tuned chat models",
    445       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    446       "year": 2023,
    447       "arxiv_id": "2307.09288",
    448       "relevance": "One of the two robust models tested; its alignment approach resists the space attack, suggesting stronger fine-tuning practices."
    449     },
    450     {
    451       "title": "Multilingual Jailbreak Challenges in Large Language Models",
    452       "authors": ["Yue Deng", "Wenxuan Zhang", "Sinno Jialin Pan", "Lidong Bing"],
    453       "year": 2023,
    454       "relevance": "Demonstrates alignment is weaker in low-resource languages, complementary to this paper's finding about tokenization-based vulnerabilities."
    455     },
    456     {
    457       "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks",
    458       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"],
    459       "year": 2023,
    460       "relevance": "Proposes a defense against adversarial jailbreak attacks on LLMs, relevant to mitigating template-level perturbation attacks."
    461     },
    462     {
    463       "title": "Certifying LLM Safety against Adversarial Prompting",
    464       "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas", "Soheil Feizi", "Hima Lakkaraju"],
    465       "year": 2023,
    466       "relevance": "Proposes certified safety guarantees for LLMs against adversarial prompts, directly relevant to defending against template-level attacks."
    467     },
    468     {
    469       "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models",
    470       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    471       "year": 2023,
    472       "relevance": "Automated approach to finding natural-looking jailbreak prompts, related attack methodology."
    473     },
    474     {
    475       "title": "Coercing LLMs to do and reveal (almost) anything",
    476       "authors": ["Jonas Geiping", "Alex Stein", "Manli Shu", "Khalid Saifullah", "Yuxin Wen", "Tom Goldstein"],
    477       "year": 2024,
    478       "arxiv_id": "2402.14020",
    479       "relevance": "Studies glitch tokens and their role in exposing LLM attack surfaces, related to this paper's tokenization vulnerability analysis."
    480     },
    481     {
    482       "title": "Fishing for Magikarp: Automatically Detecting Under-trained Tokens in Large Language Models",
    483       "authors": ["Sander Land", "Max Bartolo"],
    484       "year": 2024,
    485       "relevance": "Methods for detecting under-trained tokens in LLMs, directly related to understanding how tokenization creates alignment vulnerabilities."
    486     },
    487     {
    488       "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    489       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    490       "year": 2023,
    491       "relevance": "LLM-based safety filter for conversations, a defense mechanism that could potentially detect outputs generated by the space attack."
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 2,
    497       "justification": "Engineers maintaining chat templates need awareness of this vulnerability; finding has direct implications for deployment practices but no tool is provided."
    498     },
    499     "surprise_contrarian": {
    500       "score": 3,
    501       "justification": "A single space character breaking RLHF alignment is highly surprising and counterintuitive, directly challenging assumptions about alignment robustness."
    502     },
    503     "fear_safety": {
    504       "score": 2,
    505       "justification": "Demonstrates a trivially simple attack that bypasses safety alignment on most tested models, raising concerns about deployed safety measures."
    506     },
    507     "drama_conflict": {
    508       "score": 1,
    509       "justification": "Raises questions about alignment robustness but does not directly target any company or create major controversy."
    510     },
    511     "demo_ability": {
    512       "score": 2,
    513       "justification": "Code released on GitHub; anyone with access to open-source models can reproduce the attack on standard hardware."
    514     },
    515     "brand_recognition": {
    516       "score": 1,
    517       "justification": "From NUS (respected university but not a top AI lab); involves well-known model families (Llama, Mistral) but is not from the model creators."
    518     }
    519   }
    520 }

Impressum · Datenschutz