ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (33628B)


      1 {
      2   "scan_version": 3,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "The Hidden Dimensions of LLM Alignment: A Multi-Dimensional Analysis of Orthogonal Safety Directions",
      6     "authors": ["Wenbo Pan", "Zhichao Liu", "Qiguang Chen", "Xiangyang Zhou", "Haining Yu", "Xiaohua Jia"],
      7     "year": 2025,
      8     "venue": "ICML 2025 (Proceedings of the 42nd International Conference on Machine Learning, PMLR 267)",
      9     "arxiv_id": "2502.09674",
     10     "doi": "10.48550/arXiv.2502.09674"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The abstract states: 'Code and artifacts are available at https://github.com/BMPixel/safety-residual-space.' A working URL is provided."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available datasets: STRONG REJECT (Souly et al., 2024) and OR-Bench (Cui et al., 2024). The dataset construction procedure from these public sources is documented in Appendix C.1, and the GitHub release includes 'artifacts.'"
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. Hardware is mentioned ('six A800 GPUs') but software dependencies and library versions are not specified."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are included in the paper. While training details and hyperparameters are provided in the appendix, there is no 'Reproducing Results' section or scripts walkthrough."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "All results (attack pass rates in Table 3, Strong Reject scores in Tables 5-7, prediction accuracy in Figure 3) are reported as point estimates without confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No statistical significance tests are used. Claims like 'Trigger Removal maintains approximately 40% effectiveness' and comparisons between methods are based on raw number comparisons without any statistical test."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported with baseline context: 'the average STRONGREJECT score decreased significantly from 0.65 to 0.05, while the refusal accuracy improved to 90%' (Section 3.2). Tables 3, 5-7 provide full attack pass rates and scores across conditions."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The dataset of 2600 training samples and 540 test samples (60 STRONG REJECT + 480 OR-Bench) is described but no justification for these sizes is given. No power analysis is discussed."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Results appear to be from single experimental runs. No standard deviation, variance, or spread measures are reported across multiple runs or seeds."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines are included: probe vectors and Best-of-N BASE for direction prediction (Figure 3), and seven jailbreak methods (PAIR, ReNellm, GPTFuzz, GCG, CodeChameleon, Flip, Simple) as baselines against the trigger removal attack (Table 3)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include recent methods: PAIR (2023), FLIPAttack (2024), GPTFuzz (2023), GCG (2023), CodeChameleon (2024), and the refusal direction from Arditi et al. (2024)."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Intervention experiments serve as ablations: removing individual components (L14-C4, L14-C5, L14-C6, L25-C1) and measuring the effect on attack pass rates (Figure 4). Non-dominant suppression experiments (Figure 6) further isolate component contributions."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used: refusal accuracy, Strong Reject scores (Tables 5-7), attack pass rates (Table 3), prediction accuracy (Figure 3), MSE of residual space approximation (Table 4), and perplexity on Alpaca dataset (Figure 12)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All evaluation is automated. Harmfulness is scored by Strong Reject (LLM-based), refusal is classified automatically, and no human evaluation of model outputs is conducted."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Appendix C.1 describes a fixed test set 'completely disjoint from the training set': 60 STRONG REJECT samples with all jailbreak methods applied plus 480 OR-Bench harmless samples that 'do not overlap with the training set.'"
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by jailbreak method in Tables 3, 5, 6, and 7, showing how each method responds to different exposure levels. Figure 4 shows per-attack-type pass rates under interventions."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 7 discusses limitations: 'some directions occasionally flip between different layers, and feature directions cannot be extended indefinitely without degrading generation quality.' Figure 4 shows L14-C4 and L14-C5 'do not exhibit clear selectiveness.' Appendix C.4 shows perplexity degradation from interventions."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "DPO trained directly on the original model 'exhibits inconsistent performance in rejecting various forms of the PAIR method' (Appendix C.2). Perplexity increases from DPO (8.42 vs 7.10 base) are reported (Figure 12). The Best-of-N BASE baseline 'fail[s] to predict refusal behavior' (Section 4)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims are supported: multi-dimensional safety directions (Figure 2, SVD analysis), dominant direction governs refusal (Figure 3), non-dominant directions for hypothetical narrative/role-playing (Tables 1-2), trigger token removal bypasses safety (Table 3, Figure 6). The abstract appropriately bounds the case study to Llama 3 8B."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims are backed by intervention experiments: removing specific directions from activation space and measuring the effect on refusal (Figure 4, Equation 2). This controlled single-variable manipulation is adequate for the causal claims made (e.g., 'removing L14-C6 specifically ablates the model's ability to refuse PAIR prompts')."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title 'The Hidden Dimensions of LLM Alignment' and conclusion 'what LLMs learn from safety fine-tuning' speak broadly about LLMs, but experiments are limited to Llama 3.1 8B (primary), Llama 3.2 3B, and Ministral-8B-Instruct — all small models from two architecture families. The paper does not bound claims to these specific models in the title or conclusion."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not substantively discuss alternative explanations for its core results. Section 7 discusses limitations (non-linearity, data complexity) but does not consider alternative interpretations of the observed phenomena — e.g., whether the SVD directions could reflect artifacts of the decomposition method rather than true safety features, or whether the trigger removal success could have other explanations."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures refusal accuracy and Strong Reject scores and claims these measure safety alignment behavior. These are direct measurements of the claimed behaviors (refusal and harmfulness of outputs). The paper stays close to what it measures without inflating to broader proxy claims."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model versions are stated: 'Llama 3.1 8B Instruct', 'Llama 3.1 405B Instruct', 'Hermes-3-Llama-3.1-70B', 'Llama-3.2-3B-Instruct', and 'Ministral-8B-Instruct'. For open-source models, these names identify specific released weight sets."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt templates are provided in Appendix C.1: Prompt 1 (Simple Jailbreaking Prompt Template), Prompt 2 (Reject Prompt Template for Llama-3.1-405B), and Prompt 3 (Accept Prompt Template for Hermes-3-Llama-3.1-70B) with complete text."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix C.2 reports: 'learning rate of 1e−6, batch size 24, AdamW optimizer, maximum gradient norm 1.0, and training for 1 epoch' for SSFT. DPO: 'learning rate of 1e−6, batch size 24, AdamW optimizer, maximum gradient norm 1.0, DPO beta 0.1, with training conducted for 1 epoch.'"
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper directly analyzes model activations and performs standard fine-tuning."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix C.1 documents dataset construction in detail: how jailbreak methods are applied to STRONG REJECT samples, how preference pairs are created using Llama 405B and Hermes-3 for response sampling, the N-SHOT mechanism for dynamic dataset division, and the train/test split logic (Figure 7)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 7 (Discussion) serves as a substantive limitations discussion, covering non-linearity of some feature directions, data complexity effects, and the connection to the Linear Representation Hypothesis. There is also an Impact Statement discussing dual-use risks."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 7 discusses specific threats: 'some directions occasionally flip between different layers, and feature directions cannot be extended indefinitely without degrading generation quality' and 'as data complexity and model size increase, we expect the effective rank of the residual space will also increase, introducing more potential feature directions. While our framework's methodology remains applicable, interpreting these directions becomes more challenging.'"
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. Section 7 mentions future work directions and general limitations but lacks specific boundary statements about untested settings, excluded populations, or claims the authors are not making."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "While code and 'artifacts' are released at the GitHub repository, the paper does not explicitly confirm that raw data (model activations, constructed preference dataset, intermediate analysis outputs) is available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Appendix C.1 thoroughly describes data collection: jailbreak methods applied (PAIR, ReNellm, GPTFuzz, GCG, CodeChameleon, Flip, Simple), source datasets (STRONG REJECT, AdvBench, OR-Bench), sampling procedures for DPO pairs, and the complete N-SHOT division mechanism."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data comes from standard public benchmarks (STRONG REJECT, OR-Bench, AdvBench) and automated jailbreak generation."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The full data pipeline is documented in Appendix C.1 and Figure 7: from source datasets through jailbreak method application, response sampling with specific LLMs, to train/test split with exact counts at each stage (e.g., 1300 harmful + 1300 harmless training, 60 test STRONG REJECT + 480 test OR-Bench)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgments section states: 'This work was supported by HK RGC RIF (Research Impact Fund) R1012-21 and GRF grant (CityU 11211422).'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: City University of Hong Kong, Harbin Institute of Technology (two schools), and Microsoft. One co-author (Xiangyang Zhou) is at Microsoft."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The Hong Kong Research Grants Council (RGC) is an independent government funding body that has no financial stake in the outcome of this safety alignment research."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is included in the paper. One author is affiliated with Microsoft, which develops LLMs, but no explicit declaration of financial interests is made."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff date is stated for Llama 3.1 8B Instruct or any of the models used. The model's pre-training data period is not discussed."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "While the paper constructs disjoint train/test splits for its own fine-tuning (Appendix C.1), it does not discuss whether the pre-trained Llama 3.1 model's training data already contained information about the jailbreak techniques or STRONG REJECT examples being used for evaluation."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of whether STRONG REJECT or the jailbreak methods (PAIR, GCG, etc.) were in Llama 3.1's pre-training data. Since these were publicly available before Llama 3.1's release, a contamination risk exists but is not addressed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, latency, or per-example cost is reported. The trigger removal attack requires up to 30 LLM calls per sample (comparable to TAP and PAIR) but the actual cost is not quantified."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "Hardware is mentioned ('All experiments used six A800 GPUs' in C.2) and training is 1 epoch, but total GPU hours, wall-clock training time, or total compute budget is not stated."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single experimental runs."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs producing the reported results is not stated. It is unclear whether results are from single runs or averaged."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Hyperparameters are reported (learning rate, batch size, etc.) but no search budget, search method, or number of configurations tried is stated."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No explanation of how the reported hyperparameter configuration was selected. The choice of layer 14 for analysis is motivated by the mid-early safety layer hypothesis but the specific hyperparameters appear unjustified."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper makes many comparisons across jailbreak methods, exposure levels, and model variants (Tables 3, 5-7) without any correction for multiple comparisons."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors introduce the Trigger Removal Attack and compare it against other jailbreak methods without acknowledging the inherent bias of evaluating their own method. They re-implement baseline jailbreaks without discussing potential self-comparison bias."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The trigger removal attack requires iterative LLM calls (up to 30 per sample) which is substantially more compute than simple attacks like GCG, but performance is not reported as a function of compute budget."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Strong Reject scores are used as the primary safety metric without questioning whether this LLM-based scoring actually measures harmfulness as claimed. No discussion of construct validity of the evaluation metrics."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is involved. The paper directly analyzes model activations and performs standard fine-tuning."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether Llama 3.1's training data includes information about the jailbreak methods or evaluation datasets used, despite these being publicly available before the model's release."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. For instance, the trigger removal attack uses PLRP analysis on the same model being evaluated, which could constitute a form of information leakage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "While train/test splits are described as disjoint, no analysis is provided of structural similarities between train and test examples (e.g., same jailbreak templates applied to different toxic prompts from the same distribution)."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method (canary strings, membership inference, n-gram overlap) is applied."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Safety-aligned behavior is jointly controlled by multi-dimensional orthogonal directions in activation space, not just a single direction.",
    363       "evidence": "SVD analysis of the safety residual space shows effective rank > 1 across layers (Figure 2). Best-of-N results demonstrate multiple orthogonal directions can predict refusal beyond the dominant component (Figure 3, Section 4).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "A dominant direction (first SVD component) governs the model's refusal behavior and achieves near-perfect prediction accuracy in later layers.",
    368       "evidence": "Figure 3 shows the dominant direction achieves ~0.95+ accuracy in predicting refusal behavior after layer 15, comparable to probe vectors trained on pairwise data.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Non-dominant directions represent distinct, interpretable features such as hypothetical narrative and role-playing patterns characteristic of specific jailbreak attacks.",
    373       "evidence": "PLRP token analysis (Tables 1-2) shows L14-C2 is activated by tokens like 'Imagine', 'fictional', 'hypothetical'; L14-C5 by 'ChatGPT' references; L14-C6 by 'Sure, I'm happy to help' with 'Imagine'. Intervention experiments (Figure 4) confirm L14-C6 specifically ablates PAIR refusal.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "The safety residual space is approximately low-rank linear, with a good affine approximation of the actual post-finetuning transformation.",
    378       "evidence": "Table 4 shows MSE/||Xu||² ratios on the order of 10⁻⁵ across layers. Figure 2 shows concentrated eigenvalues with long-tail spectrum distributions.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "The trigger removal attack maintains ~40% effectiveness after safety fine-tuning on 80 samples per jailbreak, while other attacks drop to near zero.",
    383       "evidence": "Table 3 shows trigger removal at 0.42 attack pass rate vs 0.03 (PAIR), 0.00 (ReNellm), 0.03 (GPTFuzz), 0.03 (GCG) at 80-shot exposure.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Safety training may model spurious correlations in certain jailbreak patterns, allowing out-of-domain attacks to bypass alignment.",
    388       "evidence": "The trigger removal attack's resilience to fine-tuning (Table 3), combined with the observation that non-dominant directions capture jailbreak-specific patterns rather than harmfulness (Section 5.1), supports this interpretation. However, this is a mechanistic interpretation rather than a directly tested claim.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "DPO training learns more divergent directions compared to SSFT, making it harder to identify dominant safety features when applied directly.",
    393       "evidence": "Figure 9 shows DPO trained directly on the original model exhibits inconsistent performance across PAIR variants. The effective rank continues increasing in final layers for DPO but decreases to 1 for SSFT (Figure 2). Combining SFT + DPO initialization improves consistency (Table 7).",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": ["benchmark-eval", "theoretical"],
    398   "key_findings": "Safety-aligned behavior in LLMs is controlled by multiple orthogonal directions in activation space, not just a single 'refusal direction.' A dominant SVD direction predicts refusal behavior, while non-dominant directions encode interpretable features tied to specific jailbreak patterns (e.g., hypothetical framing, role-playing). The paper introduces a Partial Layer-wise Relevance Propagation method to interpret these directions via token attribution. A trigger removal attack based on these insights maintains ~40% effectiveness even after safety fine-tuning on 80 exposure samples, suggesting safety alignment may learn spurious correlations with jailbreak-specific patterns rather than robust harmfulness detection.",
    399   "red_flags": [
    400     {
    401       "flag": "No uncertainty quantification",
    402       "detail": "All main results (attack pass rates, Strong Reject scores, prediction accuracy) are reported as single point estimates without error bars, confidence intervals, or variance across runs. This is particularly concerning given that DPO training showed 22% higher loss variance for Ministral."
    403     },
    404     {
    405       "flag": "Limited model diversity",
    406       "detail": "Primary experiments are on Llama 3.1 8B with additional tests only on Llama 3.2 3B and Ministral-8B-Instruct — all small models (~3-8B parameters). Claims about 'LLM alignment' broadly are not well supported by this narrow model selection."
    407     },
    408     {
    409       "flag": "Attack methodology confound",
    410       "detail": "The trigger removal attack uses Llama 3.1 405B for rephrasing, falling back to Hermes 3 405B (which has 'weaker safety guardrails') when the primary model refuses. This fallback introduces a confound: the attack's effectiveness may partly reflect the capabilities of the auxiliary model rather than the trigger removal insight."
    411     },
    412     {
    413       "flag": "Self-comparison bias",
    414       "detail": "The authors introduce the trigger removal attack and compare it favorably against existing jailbreak methods, but the comparison may be unfair: the trigger removal attack uses white-box access (PLRP analysis) to the victim model while baselines like PAIR and GPTFuzz are black-box attacks."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Refusal in language models is mediated by a single direction",
    420       "authors": ["Andy Arditi", "Oscar Obeso", "Aaquib Syed", "Daniel Paleka", "Nina Panickssery", "Wes Gurnee", "Neel Nanda"],
    421       "year": 2024,
    422       "arxiv_id": "2406.11717",
    423       "relevance": "Foundational work on single-direction refusal representations that this paper directly extends to multi-dimensional safety directions."
    424     },
    425     {
    426       "title": "Universal and transferable adversarial attacks on aligned language models",
    427       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    428       "year": 2023,
    429       "arxiv_id": "2307.15043",
    430       "relevance": "GCG jailbreak attack used as a baseline in the safety fine-tuning evaluation and dataset construction."
    431     },
    432     {
    433       "title": "Jailbreaking black box large language models in twenty queries",
    434       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"],
    435       "year": 2023,
    436       "arxiv_id": "2310.08419",
    437       "relevance": "PAIR attack method used as a primary baseline; the paper identifies a specific safety direction (L14-C6) that governs PAIR refusal."
    438     },
    439     {
    440       "title": "A wolf in sheep's clothing: Generalized nested jailbreak prompts can fool large language models easily",
    441       "authors": ["Peng Ding", "Jun Kuang", "Dan Ma", "Xuezhi Cao", "Yunsen Xian", "Jiajun Chen", "Shujian Huang"],
    442       "year": 2023,
    443       "arxiv_id": "2311.08268",
    444       "relevance": "ReNeLLM jailbreak attack used as a baseline in safety fine-tuning experiments."
    445     },
    446     {
    447       "title": "Direct preference optimization: Your language model is secretly a reward model",
    448       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D. Manning", "Stefano Ermon", "Chelsea Finn"],
    449       "year": 2024,
    450       "relevance": "DPO alignment method directly used for safety fine-tuning in the experiments; paper compares SSFT vs DPO residual spaces."
    451     },
    452     {
    453       "title": "A strongreject for empty jailbreaks",
    454       "authors": ["Alexandra Souly", "Qingyuan Lu", "Dillon Bowen", "Tu Trinh", "Elvis Hsieh", "Sana Pandey", "Pieter Abbeel", "Justin Svegliato", "Scott Emmons", "Olivia Watkins", "Sam Toyer"],
    455       "year": 2024,
    456       "relevance": "STRONG REJECT benchmark and scoring used as the primary evaluation metric and data source for harmful prompts."
    457     },
    458     {
    459       "title": "What makes and breaks safety fine-tuning? A mechanistic study",
    460       "authors": ["Samyak Jain", "Ekdeep Singh Lubana", "Kemal Oksuz", "Tom Joy", "Philip H. Torr", "Amartya Sanyal", "Puneet K. Dokania"],
    461       "year": 2024,
    462       "arxiv_id": "2407.10264",
    463       "relevance": "Related mechanistic interpretability work on safety fine-tuning dynamics that motivates the safety residual space framework."
    464     },
    465     {
    466       "title": "Understanding jailbreak success: A study of latent space dynamics in large language models",
    467       "authors": ["Sarah Ball", "Frauke Kreuter", "Nina Rimsky"],
    468       "year": 2024,
    469       "arxiv_id": "2406.09289",
    470       "relevance": "Studies latent space dynamics during jailbreaks using supervised probing; this paper extends beyond single-direction probes."
    471     },
    472     {
    473       "title": "The geometry of refusal in large language models: Concept cones and representational independence",
    474       "authors": ["Thomas Wollschläger", "Jannik Elstner", "Simon Geisler", "Vincent Cohen-Addad", "Stephan Günnemann", "Johannes Gasteiger"],
    475       "year": 2025,
    476       "arxiv_id": "2502.17420",
    477       "relevance": "Concurrent work also finding safety features represented by a subspace of latent activations; provides complementary perspective on multi-dimensional safety."
    478     },
    479     {
    480       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    481       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik", "Blaine Nelson", "Hyrum Anderson", "Yaron Singer", "Amin Karbasi"],
    482       "year": 2023,
    483       "arxiv_id": "2312.02119",
    484       "relevance": "TAP iterative jailbreak method that the trigger removal attack is compared to in terms of computational cost (~30 vs ~35 attempts)."
    485     },
    486     {
    487       "title": "FLIPAttack: Jailbreak LLMs via Flipping",
    488       "authors": ["Yue Liu", "Xiaoxin He", "Miao Xiong", "Jinlan Fu", "Shumin Deng", "Bryan Hooi"],
    489       "year": 2024,
    490       "arxiv_id": "2410.02832",
    491       "relevance": "FLIPAttack jailbreak method used as a baseline in safety fine-tuning experiments."
    492     },
    493     {
    494       "title": "SafeDecoding: Defending against jailbreak attacks via safety-aware decoding",
    495       "authors": ["Zhangchen Xu", "Fengqing Jiang", "Luyao Niu", "Jinyuan Jia", "Bill Yuchen Lin", "Radha Poovendran"],
    496       "year": 2024,
    497       "arxiv_id": "2402.08983",
    498       "relevance": "Defense mechanism against jailbreaks via decoding-time modifications, representing an alternative approach to safety alignment."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 1,
    504       "justification": "Primarily a mechanistic interpretability framework; the trigger removal attack has some red-teaming utility but the main contribution is theoretical understanding."
    505     },
    506     "surprise_contrarian": {
    507       "score": 2,
    508       "justification": "Directly challenges the prevailing single-direction view of safety alignment (Arditi et al. 2024), showing safety is multi-dimensional with distinct interpretable features."
    509     },
    510     "fear_safety": {
    511       "score": 2,
    512       "justification": "Demonstrates a trigger removal attack resilient to safety fine-tuning (~40% effectiveness at 80-shot), suggesting fundamental limitations of current safety alignment approaches."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "Straightforward academic contribution with no controversial claims about companies or products."
    517     },
    518     "demo_ability": {
    519       "score": 2,
    520       "justification": "Code and artifacts released on GitHub (https://github.com/BMPixel/safety-residual-space), enabling reproduction of the analysis."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Published at ICML 2025 (a top venue) and uses well-known Llama models, but the authors are mainly from CityU/HIT rather than a major AI lab; one co-author is from Microsoft."
    525     }
    526   }
    527 }

Impressum · Datenschutz