ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34142B)


      1 {
      2   "paper": {
      3     "title": "SaRO: Enhancing LLM Safety through Reasoning-based Alignment",
      4     "authors": [
      5       "Yutao Mou",
      6       "Yuxiao Luo",
      7       "Shikun Zhang",
      8       "Wei Ye"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2504.09420",
     13     "doi": "10.48550/arXiv.2504.09420"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "SaRO, a two-stage framework combining reasoning-style warmup (RW) and safety-oriented reasoning process optimization (SRPO), consistently outperforms traditional safety alignment methods (SafetySFT, DPO) on safety benchmarks while reducing over-refusal and maintaining general capabilities. The paper shows that reasoning-based alignment enhances safety primarily through extended decoding-time reasoning rather than improved input semantic understanding, and that current open-source reasoning models (QwQ-32B, DeepSeek-R1) exhibit surprisingly poor safety performance compared to SaRO-aligned 7-8B models. The Shortest Rejection Sampling strategy can reduce reasoning tokens by ~40% without significant safety degradation.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states in a footnote to Section 1: 'We release our dataset and code at https://github.com/MurrayTom/SaRO'."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "A footnote in Section 1 states that the dataset is released alongside the code at https://github.com/MurrayTom/SaRO. Additionally, training data is derived from publicly available datasets: Salad-Bench, OpenOrca, and BeaverTails."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Appendix D mentions '8 NVIDIA 80GB A800 GPUs' and decoding parameters (temperature 0.8, top-p 0.9), but no requirements.txt, Dockerfile, or library version specifications are provided. The information is insufficient to recreate the environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Appendix D provides training hyperparameters (learning rates, epochs) but no step-by-step reproduction instructions, README commands, or scripts to replicate experiments are described in the paper."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 1, 2, 7, 9, and 10 all report only point estimates (e.g., '0.33', '13.75') with no confidence intervals, error bars, or ± notation anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes numerous comparative claims ('significantly enhances', 'outperforms', 'consistently achieve lower ASR') without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports absolute differences with baseline context throughout. For example, Table 1 shows ASR dropping from 39.82% (SafetySFT) to 13.75% (SaRO) on WildJailbreak. Table 6 reports growth rates (e.g., '+47.53%'). The reader can assess the magnitude of effects."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is provided for the size of training datasets (e.g., why 580 queries for OP-COT, why 8,000 from OpenOrca), evaluation sample sizes, or the number of evaluation benchmarks selected."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. For HumanEval they sample 20 times for pass@1 but report only the average without variance. MMLU and MATH use deterministic decoding (do_sample=False) but safety evaluations use temperature=0.8 with no variance reported."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5 describes four baseline methods: Vanilla SFT, SafetySFT, SafetySFT+DPO, and RW+rDPO. Table 2 additionally compares against open-source models (LLAMA3-8B-Instruct, Qwen2-7B-Instruct, DeepSeek-R1 variants, QwQ-32B)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 compares against contemporary models including QwQ-32B, DeepSeek-R1-Distill variants, Qwen2.5-7B-Instruct, and LLAMA3.1-8B-Instruct, all from 2024-2025."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 1 presents a systematic ablation: base model → +SFT → +SafetySFT → +SafetySFT+DPO → +RW → +RW+rDPO → +RW+SRPO (SaRO), clearly showing each component's contribution. Section 6.4 also ablates the SRS strategy."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses ASR (Attack Success Rate) across multiple safety benchmarks, ERR (Error Refusal Rate) for over-refusal, ACC for MMLU and MATH, and pass@1 for HumanEval. MT-Bench and average tokens are also reported in Table 7."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluation of model outputs is automated: LlamaGuard2 judges safety of responses, GPT-4o judges over-refusal classification, and automated metrics for general benchmarks. The human evaluation in Appendix F assesses synthetic training data quality, not system outputs."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Training uses Salad-Bench MCQ subset and BeaverTails for data construction. Evaluation uses separate benchmarks: ALERT, WildJailbreak, SG-Bench (distinct from training sources), XSTest, MMLU, MATH, and HumanEval. Training and test sets are clearly separated."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Tables 1 and 2 break down results across three safety dimensions (disallowed content, jailbreak attacks by type, over-refusal) and three general capability dimensions (knowledge, mathematics, coding), rather than reporting a single aggregate score."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6.1 and Table 3 show that direct CoT prompting can make models MORE vulnerable to jailbreak attacks. Figure 3 shows that reasoning-based alignment worsens confusion between benign and harmful instructions at the embedding level. The Limitations section discusses reasoning latency costs."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 3 reports that direct CoT prompting increases vulnerability to jailbreak attacks for Qwen2.5-7B-Instruct (from 47.65% to 61.45% on SG-Bench PAIR). Section 6.1 shows RW worsens embedding-level semantic confusion (Figure 3b). These are genuine negative findings."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 'superiority of SaRO over traditional alignment methods,' which is supported by Tables 1 and 2 showing consistent improvements across safety benchmarks. Claims about addressing over-alignment and under-generalization are supported by XSTest (over-refusal) and jailbreak attack results."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims like 'SaRO enhances safety' and 'SRPO promotes reflection.' The ablation design in Table 1 (controlled addition of components: RW, rDPO, SRPO) provides adequate single-variable manipulation for these causal claims. Table 4 demonstrates that reasoning processes guide safer generation through controlled experiments."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Enhancing LLM Safety through Reasoning-based Alignment' without qualification. Primary experiments use 7-8B models. While Appendix E tests Qwen2.5-14B and LLAMA3-70B (with LoRA), the paper does not bound its claims to the tested scales or architectures. The abstract says 'LLMs' generically."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 6.1 substantively examines three candidate explanations for why reasoning-based alignment works: (1) improved input embeddings, which Figure 3 rules out; (2) simple CoT prompting, which Table 3 shows does not reproduce the effect; and (3) safety reasoning guiding autoregressive generation, which Table 4 supports. This goes beyond generic discussion."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper uses LlamaGuard2's binary safe/unsafe judgment as a proxy for 'safety' and reports ASR as the primary metric. There is no discussion of whether LlamaGuard2's judgments accurately capture actual safety, the gap between automated safety classification and real-world harm, or what 'safety' actually entails beyond benchmark performance."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Models are referred to as 'LLAMA3-8B', 'Qwen2-7B', 'Qwen2.5-7B', 'GPT-4o' without exact version identifiers or snapshot dates. No API version or model checkpoint identifiers are provided."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figures 6 and 7 in the appendix provide the complete prompt templates used for data synthesis in both the RW and SRPO stages, including the prompt for reasoning step decomposition and stepwise reflection. The actual prompt text is provided, not just natural language descriptions."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Appendix D reports: learning rate 1e-5 for RW (3 epochs), 1e-6 for SRPO (1 epoch), temperature 0.8, top-p 0.9 for evaluation. HumanEval uses temperature 0.8 with 20 samples. MMLU and MATH use do_sample=False."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. SaRO is a training method (SFT + DPO) applied to base LLMs, not an agentic system with tools, memory, or retry logic."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Appendix A.1 documents the data pipeline: Salad-Bench MCQ → GPT-4o reasoning generation → manual removal of 15 unsafe queries → 1,905 samples → augmentation with 500 MCQ/judgment samples → 8,000 OpenOrca samples → final RIT-D of 10,505. For OP-COT: 580 queries from BeaverTails → GPT-4o/Qwen2.5-72B generation → 2,188 preference pairs. Table 8 provides exact counts."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "A dedicated 'Limitations' section follows the conclusion, discussing two specific limitations: (1) reasoning latency (35-70% token increase) and (2) bias from GPT-4o synthetic data."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The Limitations section identifies specific threats: SaRO's 35-70% token overhead compared to conventional methods (quantified in Table 7), and the risk that GPT-4o may introduce 'harmful tendencies or hallucinations in generated reasoning processes.' These are specific to this study."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. It mentions future directions (adaptive reasoning length, alternative data generation methods, other optimization algorithms) but does not explicitly bound the scope—e.g., it does not state that results may not generalize to models beyond the tested families or to safety domains not covered by the evaluation benchmarks."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper states that the dataset is released in its GitHub repository (https://github.com/MurrayTom/SaRO). Training data is built from publicly available sources (Salad-Bench, OpenOrca, BeaverTails). Evaluation uses publicly available benchmarks."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Appendix A.1 describes data construction in detail: seed datasets, GPT-4o prompting strategy, few-shot approach with Qwen2.5-72B for unsafe reasoning, quality filtering with LlamaGuard2. Table 8 provides exact sample counts for each dataset component."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants in the core study. Data sources are standard public benchmarks (Salad-Bench, BeaverTails, OpenOrca) and GPT-4o synthesis. The 3 undergraduate students in Appendix F performing quality verification are not study participants."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The full pipeline is documented in Section 3 and Appendix A.1 with counts at each stage: Salad-Bench MCQ (1,920) → remove 15 unsafe → 1,905 → add 500 augmented → add 8,000 OpenOrca → RIT-D (10,505). BeaverTails (30,000) → select 580 queries → OP-COT (2,188) → decomposition → reflection → PP-COT (11,598). Automated filtering: 1.5% flagged by LlamaGuard2 and removed. Note: the listed RIT-D components (1,905 + 500 + 8,000) sum to 10,405 rather than 10,505, so one of these counts should be re-checked against Table 8."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No acknowledgments section mentioning funding sources is present in the paper. The authors are from Peking University's National Engineering Research Center for Software Engineering, but no grants or sponsors are listed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: all from 'National Engineering Research Center for Software Engineering, Peking University, China.' The paper does not evaluate a product from the authors' institution, so no additional conflict disclosure is needed."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Since funding is not disclosed, independence cannot be assessed. The authors are from an academic research center, which suggests likely independent funding, but the absence of any funding statement means this cannot be verified."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial disclosure is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper uses LLAMA3-8B, Qwen2-7B, Qwen2.5-7B, and other pre-trained models evaluated on MMLU, MATH, and HumanEval without stating any training data cutoff dates for these models."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether MMLU, MATH, or HumanEval test examples appeared in the pre-training data of LLAMA3 or Qwen2 models. No overlap analysis is performed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "HumanEval (2021), MMLU (2020), and MATH (2021) were all published before the training data collection for LLAMA3 and Qwen2. No discussion of contamination risk for these benchmarks."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study. All evaluation is automated using LlamaGuard2, GPT-4o as judge, and automated benchmark metrics."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The Ethics Statement discusses access restrictions to harmful content but does not mention IRB approval, which is appropriate given the study design."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table 7 reports average tokens per output for each method (e.g., SaRO produces ~422 tokens vs ~155 for SafetySFT on LLAMA3). Section 6.4 explicitly discusses the efficiency-effectiveness trade-off and introduces SRS to reduce reasoning token overhead."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Appendix D states '8 NVIDIA 80GB A800 GPUs' as hardware but does not report total GPU hours, training wall-clock time, or total computational budget for the experiments."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No multi-seed results are reported. Tables show single-run numbers for safety evaluations (which use stochastic decoding at temperature 0.8). No seed sensitivity analysis is performed."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "For HumanEval, 20 samples are stated (Appendix B.2). For MMLU/MATH, do_sample=False implies deterministic single runs. However, for the main safety evaluations (the paper's core contribution), which use temperature=0.8, the number of runs is not stated."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search budget is reported. The paper uses specific learning rates (1e-5, 1e-6) and DPO beta values without explaining how these were selected or how many configurations were tried."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper presents results for the chosen configuration without explaining how hyperparameters were selected. No validation set selection process is described."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper compares multiple methods across many benchmarks (7+ safety benchmarks, 3 general benchmarks) without any multiple comparison correction or family-wise error rate adjustment."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement their own baselines (Vanilla SFT, SafetySFT, SafetySFT+DPO) and compare against their own method without acknowledging the inherent bias of evaluating one's own system. No independent evaluation is performed."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Table 7 explicitly compares performance (safety + MT-Bench scores) against efficiency (average output tokens) across methods. Section 6.4 discusses the trade-off and introduces SRS to reduce compute while maintaining performance."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper uses LlamaGuard2 as the safety judge without discussing whether its binary classifications accurately capture actual safety. No discussion of whether ASR as measured by an automated judge reflects real-world safety alignment quality."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. SaRO is a training method, not an agentic system. Model comparisons use the same evaluation pipeline."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. MMLU (2020), MATH (2021), and HumanEval (2021) all predate the training of LLAMA3 and Qwen2, meaning these benchmarks could have been in pre-training data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks answer information. Safety evaluation uses LlamaGuard2 on the final response only (excluding reasoning for SaRO), but there is no analysis of whether this evaluation protocol itself affects the results."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Training uses Salad-Bench MCQ subset while evaluation includes Salad-Bench jailbreak subset. Although these are different subsets, no analysis is provided of whether shared structure or content between subsets could inflate results."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection or prevention method is used (no canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines)."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "SaRO consistently outperforms traditional alignment methods (SafetySFT, SafetySFT+DPO) on safety benchmarks across multiple model families.",
    370       "evidence": "Table 1 shows SaRO achieves the lowest ASR across all safety benchmarks for both LLAMA3-8B and Qwen2-7B. For example, on WildJailbreak: LLAMA3 SafetySFT+DPO 36.20% → SaRO 13.75%; Qwen2 SafetySFT+DPO 31.80% → SaRO 13.30%. Table 9 (Appendix E) confirms this for Mistral-7B, Qwen2.5-14B, and LLAMA3-70B.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Reasoning-based alignment reduces over-refusal compared to traditional safety alignment methods.",
    375       "evidence": "Table 1 XSTest ERR: LLAMA3 SafetySFT 14.57% vs SaRO 7.39%; Qwen2 SafetySFT 9.57% vs SaRO 5.22%. Table 2 shows SaRO models generally achieve lower ERR than their instruct counterparts.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "SaRO does not degrade general capabilities (knowledge, mathematics, coding) compared to traditional alignment methods.",
    380       "evidence": "Table 1 shows SaRO models perform slightly better or comparably on MMLU, MATH, and HumanEval compared to SafetySFT+DPO baselines. For Qwen2: MMLU 68.40 vs 68.50, MATH 51.80 vs 50.00, HumanEval 67.80 vs 47.50.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Reasoning-based alignment enhances safety primarily through extended decoding-time reasoning, not through improved semantic embeddings of inputs.",
    385       "evidence": "Section 6.1, Figure 3 shows RW-aligned models have worsened semantic embedding confusion between benign and harmful inputs. Table 4 shows that providing safety reasoning context to unaligned base LLMs reduces ASR from 100% to 2.5-16.5%.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Direct CoT prompting does not achieve the same safety improvements as reasoning-based alignment and can even increase vulnerability.",
    390       "evidence": "Table 3 shows that adding CoT prompting to Qwen2.5-7B-Instruct increases jailbreak ASR from 47.65% to 61.45% on SG-Bench (PAIR). LLAMA3 over-refusal worsens from 15.87% to 22.17%.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Current open-source reasoning models (QwQ-32B, DeepSeek-R1) have poor safety performance compared to SaRO-aligned models of the same or smaller scale.",
    395       "evidence": "Table 2 shows DeepSeek-R1-Distill-Llama-8B has 84.65% ASR on SG-Bench (PAIR) vs SaRO's 27.81%. QwQ-32B has 50.13% on SG-Bench (PAIR). DeepSeek-R1-Distill-Qwen-7B scores 84.23% on PAIR vs SaRO's 23.20%.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "SRPO with process-level preference data promotes more frequent safety reflection and self-correction in reasoning chains compared to outcome-level DPO.",
    400       "evidence": "Table 5 shows SaRO-aligned models produce safety policy mentions in 196-198/200 prompts vs 154-189 for RW or rDPO. Figure 4 shows SRPO continuously increases reward margins between safe and unsafe responses while rDPO and DPO plateau.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or variance reporting",
    407       "detail": "All main results (Tables 1, 2, 7, 9, 10) report only point estimates with no variance, standard deviation, or confidence intervals, despite safety evaluations using stochastic decoding (temperature=0.8). The reproducibility of the reported numbers cannot be assessed."
    408     },
    409     {
    410       "flag": "No statistical significance tests",
    411       "detail": "The paper makes numerous claims that SaRO 'significantly' outperforms baselines and 'consistently achieves lower ASR' without any formal statistical tests. All comparisons rest on raw point estimates."
    412     },
    413     {
    414       "flag": "Self-comparison bias in baselines",
    415       "detail": "The authors implement their own versions of SafetySFT and SafetySFT+DPO baselines rather than using established implementations. Self-implemented baselines may be tuned less carefully than the authors' own method, which would overstate SaRO's relative gains. No independent evaluation is performed."
    416     },
    417     {
    418       "flag": "Automated safety judge as sole evaluator",
    419       "detail": "All safety evaluation relies on LlamaGuard2 as the binary judge. No human evaluation of model safety outputs is performed. LlamaGuard2's agreement with actual safety is not examined, and the judge could systematically favor or disfavor certain response styles (e.g., reasoning-augmented responses)."
    420     },
    421     {
    422       "flag": "GPT-4o dependency for data synthesis",
    423       "detail": "The entire training data pipeline depends on GPT-4o for generating reasoning chains. This introduces a dependency on a proprietary model whose behavior may change, and the synthesized data may carry GPT-4o's biases. The authors acknowledge this in the Limitations section."
    424     },
    425     {
    426       "flag": "Potential data overlap between training and evaluation",
    427       "detail": "Training uses Salad-Bench MCQ subset while evaluation includes Salad-Bench jailbreak subset. Although these are different subsets of the same benchmark, shared structure and content could inflate results. No analysis of this overlap is provided."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Deliberative alignment: Reasoning enables safer language models",
    433       "authors": ["Melody Y Guan", "Manas Joglekar", "Eric Wallace", "Saachi Jain"],
    434       "year": 2024,
    435       "arxiv_id": "2412.16339",
    436       "relevance": "Directly comparable approach to SaRO for aligning reasoning models with safety policies, using SFT+RL on OpenAI's O-series models."
    437     },
    438     {
    439       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    440       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    441       "year": 2025,
    442       "arxiv_id": "2501.12948",
    443       "relevance": "Major open-source reasoning model evaluated in this paper, shown to have poor safety performance compared to SaRO-aligned models."
    444     },
    445     {
    446       "title": "Direct preference optimization: Your language model is secretly a reward model",
    447       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    448       "year": 2024,
    449       "relevance": "Core optimization method (DPO) used in SaRO's SRPO stage for safety-oriented preference optimization."
    450     },
    451     {
    452       "title": "Safe RLHF: Safe reinforcement learning from human feedback",
    453       "authors": ["Josef Dai", "Xuehai Pan", "Ruiyang Sun"],
    454       "year": 2023,
    455       "arxiv_id": "2310.12773",
    456       "relevance": "Decoupled helpfulness and harmlessness objectives for safety alignment, baseline approach that SaRO aims to improve upon."
    457     },
    458     {
    459       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    460       "authors": ["Jiaming Ji", "Mickel Liu", "Josef Dai"],
    461       "year": 2024,
    462       "relevance": "Source dataset for SaRO's preference data construction (OP-COT and PP-COT built from BeaverTails queries)."
    463     },
    464     {
    465       "title": "Salad-bench: A hierarchical and comprehensive safety benchmark for large language models",
    466       "authors": ["Lijun Li", "Bowen Dong", "Ruohui Wang"],
    467       "year": 2024,
    468       "arxiv_id": "2402.05044",
    469       "relevance": "Source of training data (MCQ subset) and safety evaluation benchmark used in this study."
    470     },
    471     {
    472       "title": "SafeChain: Safety of language models with long chain-of-thought reasoning capabilities",
    473       "authors": ["Fengqing Jiang", "Zhangchen Xu", "Yuetai Li"],
    474       "year": 2025,
    475       "arxiv_id": "2502.12025",
    476       "relevance": "Related work studying safety risks of long CoT reasoning in LLMs, finding that long reasoning can lead to more harmful outputs."
    477     },
    478     {
    479       "title": "WildTeaming at scale: From in-the-wild jailbreaks to (adversarially) safer language models",
    480       "authors": ["Liwei Jiang", "Kavel Rao", "Seungju Han"],
    481       "year": 2024,
    482       "arxiv_id": "2406.18510",
    483       "relevance": "Source of the WildJailbreak evaluation dataset used for safety testing in this study."
    484     },
    485     {
    486       "title": "Rule based rewards for language model safety",
    487       "authors": ["Tong Mu", "Alec Helyar", "Johannes Heidecke"],
    488       "year": 2024,
    489       "arxiv_id": "2411.01111",
    490       "relevance": "Rule-based reward model approach used in GPT-4 safety alignment, a baseline paradigm that SaRO contrasts with."
    491     },
    492     {
    493       "title": "Step-DPO: Step-wise preference optimization for long-chain reasoning of LLMs",
    494       "authors": ["Xin Lai", "Zhuotao Tian", "Yukang Chen"],
    495       "year": 2024,
    496       "arxiv_id": "2406.18629",
    497       "relevance": "Related work on step-level preference optimization for reasoning, providing methodological foundation for SaRO's stepwise reflection approach."
    498     },
    499     {
    500       "title": "XSTest: A test suite for identifying exaggerated safety behaviours in large language models",
    501       "authors": ["Paul Röttger", "Hannah Rose Kirk", "Bertie Vidgen"],
    502       "year": 2023,
    503       "arxiv_id": "2308.01263",
    504       "relevance": "Evaluation benchmark for measuring over-refusal/over-alignment in safety-tuned LLMs."
    505     },
    506     {
    507       "title": "SG-Bench: Evaluating LLM safety generalization across diverse tasks and prompt types",
    508       "authors": ["Yutao Mou", "Shikun Zhang", "Wei Ye"],
    509       "year": 2024,
    510       "relevance": "Safety generalization benchmark from the same research group, used as a key evaluation dataset for jailbreak resistance."
    511     },
    512     {
    513       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    514       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen"],
    515       "year": 2024,
    516       "arxiv_id": "2310.04451",
    517       "relevance": "Automated jailbreak attack method used to generate adversarial prompts in the SG-Bench evaluation."
    518     }
    519   ],
    520   "engagement_factors": {
    521     "practical_relevance": {
    522       "score": 2,
    523       "justification": "Provides a usable training framework with released code and data, but requires significant GPU resources and expertise to apply."
    524     },
    525     "surprise_contrarian": {
    526       "score": 1,
    527       "justification": "Confirms that reasoning helps safety (expected direction) rather than challenging conventional wisdom; the finding that CoT can worsen safety is mildly surprising."
    528     },
    529     "fear_safety": {
    530       "score": 2,
    531       "justification": "Demonstrates that mainstream open-source reasoning models (DeepSeek-R1, QwQ) have high jailbreak vulnerability, raising concerns about deployed reasoning models."
    532     },
    533     "drama_conflict": {
    534       "score": 1,
    535       "justification": "Implicitly critiques DeepSeek-R1 and QwQ safety, but does not frame it as controversy; no dramatic claims or accusations."
    536     },
    537     "demo_ability": {
    538       "score": 1,
    539       "justification": "Code released on GitHub but requires multi-GPU training setup; not a pip-installable tool or live demo."
    540     },
    541     "brand_recognition": {
    542       "score": 1,
    543       "justification": "From Peking University, a well-known institution but not a prominent AI safety lab. No association with major AI companies."
    544     }
    545   }
    546 }

Impressum · Datenschutz