ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26798B)


      1 {
      2   "paper": {
      3     "title": "Efficient Switchable Safety Control in LLMs via Magic-Token-Guided Co-Training",
      4     "authors": ["Jianfeng Si", "Lin Sun", "Zhewen Tan", "Xiangzheng Zhang"],
      5     "year": 2025,
      6     "venue": "AAAI 2026 Special Track on AI Alignment (extended version)",
      7     "arxiv_id": "2508.14904",
      8     "doi": "10.48550/arXiv.2508.14904"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Magic-token-guided co-training enables a single LLM to support three switchable safety behaviors (positive, negative, rejective) within a single SFT stage, matching SFT+DPO two-stage pipeline performance. The 8B model achieves 97.55 avg safety score on English benchmarks, surpassing DeepSeek-R1 (671B) on safety. The co-training induces a measurable Safety Alignment Margin (SAM=0.131) showing well-separated behavioral pathways in first-token logit space. Multi-policy extension fuses English and Chinese safety norms with culture-specific control tokens.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub link provided: https://github.com/Qihoo360/LLMs-Safety-Control. A safer variant, TinyR1-S-8B, is also mentioned as released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "EN-ALIGN and ZH-ALIGN datasets generated via self-distillation are described but no download link is provided. The in-house Chinese evaluation datasets (ZH-Red, ZH-Red attack) are not released. Some evaluation datasets are public (S-Eval, HarmBench, XSTest)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions ModelScope/ms-swift framework and 8 NVIDIA H800 GPUs but does not provide requirements.txt, library versions, or environment setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link exists but the paper itself does not describe how to replicate the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 2-5 are reported as point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims 'outperforms' various baselines based on comparing raw numbers without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with context: e.g., 'baselines experience an average performance drop of 21.5% under attack, ours declines by 3.8% only' (Figure 1), and full baseline-vs-method scores in Table 2 allow effect size computation."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for evaluation dataset sizes (e.g., 300 HarmBench, 1000 S-Eval samples). No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures reported across runs. Results appear to be single-run numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 2 includes multiple baselines: Qwen3-8B, DeepSeek-R1-8B, Nemotron-8B, Llama3-8B, Qwen3-32B, DeepSeek-R1 (671B), plus ablation variants (SPos, TPos, TPos/DPO)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent models: Qwen3-8B/32B, DeepSeek-R1-0528, Llama-3.1-Nemotron-Nano-8B, and comparison with DPO methods. These are contemporary as of 2025."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 includes systematic ablations: SPos vs TPos (single vs multi-directional distillation), TPos vs TPos/DPO (with/without DPO stage), MTC vs TPos (co-training vs single-behavior). Table 4 ablates SAM across model variants."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Primary metric is Constructive Safety Score. Extended evaluation in Appendix C adds Safety Score (S), Helpfulness Score (H), and CoSA-Score (C). Also SAM metric in Table 4."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Manual review of 2,540 samples to validate the safety evaluation classifier accuracy (94.7%/99.6%/98.9% per-class accuracy, 97.5% overall). This is human evaluation of the evaluation tool, not the system outputs directly, but does involve human judgment on system outputs."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Evaluation uses separate benchmark datasets (HarmBench, S-Eval, XSTest, NVIDIA Aegis 2.0) that are distinct from the training data. Training uses Llama-Nemotron chat data and self-distilled safety data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 provides per-dataset breakdowns across 5 English and 4 Chinese evaluation sets. Table 3 breaks down behavioral controllability per mode per dataset. Table 5 provides per-benchmark S/H/C scores."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 3 discusses neg mode producing pos outputs in 31.8% of cases (50% on XSTest), with analysis attributing this to safe prompts where the model 'appropriately avoids introducing risks.' MTC/MP rand and no-token failure modes are also tested."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The neg mode achieves only 67.8% negative response rate (not perfect control). MTC/MP rand and MTC/MP no show degraded performance. TinyR1-S-8B/adh mode shows negative CoSA scores, acknowledged as 'prioritizes usefulness at the cost of safety.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of matching SFT+DPO quality (97.55 vs 97.58, Table 2), surpassing DeepSeek-R1 671B in safety (97.55 vs 87.45), and reduced training complexity (single SFT stage) are all supported by Table 2 results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'multi-directional distillation improves pos quality' are supported by controlled ablations (SPos vs TPos, same data pipeline, same base model). The ablation design holds confounds constant (same CHAT data, same base model)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract claims a 'scalable, efficient, and highly controllable solution for LLM content safety' but experiments are conducted only on Qwen3-8B. No evidence of scalability to other model families or sizes. Title claims 'LLMs' generally."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the results. Could the safety improvement come from simply training on more safety data (3x triplets)? Could the in-house evaluator favor the in-house model? These confounds are not discussed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper frames the Constructive Safety Score as measuring 'safety alignment quality' without discussing the gap between automated classifier scores and actual content safety. The 0/1/2 scoring system conflates refusal with constructive engagement but does not discuss this design choice's limitations."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Qwen3-8B is specified as the base model. Specific model identifiers given for baselines: DeepSeek-R1-0528-Qwen3-8B, Llama-3.1-Nemotron-Nano-8B-v1, Meta-Llama-3.1-8B-Instruct."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix A provides the full prompt template for multi-directional self-distillation. Appendix B provides the helpfulness evaluation prompt. The magic token strings are provided (rfcd9lbo, 8v4v5sa3, q787fvif)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 reports: SFT 5 epochs, lr=1e-5, warmup=0.01; DPO 1 epoch, lr=1e-6, β=0.1; inference temperature=0.9, top_p=0.6, max_tokens=4k."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The method is a single-stage SFT training approach with magic tokens in system prompts."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 describes data construction: EN/CHAT from Llama-Nemotron (39,792 pairs), EN/SAFETY from 11,010 prompts yielding 10,977 per behavior, ZH/CHAT 20,000 pairs, ZH/SAFETY 16,521 per behavior. Think/no-think duplication process described."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The conclusion mentions 'mitigating potential misuse of neg modes' as future work but does not substantively discuss limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity discussed. The reliance on a single base model, single in-house evaluator, and potential evaluator bias are not addressed."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries stated. The paper does not clarify that results are specific to Qwen3-8B or that the approach is untested on other architectures or model sizes."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw model outputs and evaluation scores are not available for independent verification. Only aggregated scores are reported in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes the data collection: sources for CHAT data, self-distillation process for SAFETY data with specific policy frameworks (AEGIS 2.0, Chinese regulatory taxonomy). Section 4.3 describes evaluation datasets."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants recruited. Evaluation data comes from standard benchmarks and in-house constructed datasets."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: base model generates triplets under policy-guided prompts → think/no-think duplication → mixing with CHAT data → SFT training. Sample counts provided at each stage (11,010 → 10,977 per behavior)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information disclosed. Authors are from Qiyuan Tech (a subsidiary of Qihoo 360) but no funding statement is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly stated: all from 'Qiyuan Tech, Beijing, China.' The GitHub repo is under Qihoo360."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Qiyuan Tech/Qihoo 360 is the employer and has commercial interest in demonstrating effective safety controls for their LLM products. The funder is not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement. Authors from a commercial AI company evaluating their own safety framework without declaring financial interests."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The training cutoff of Qwen3-8B base model is not stated. This matters because the model is fine-tuned and evaluated on benchmarks that may overlap with pre-training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether Qwen3-8B's pre-training data overlaps with evaluation benchmarks (HarmBench, S-Eval, XSTest)."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HarmBench (2024), S-Eval, and XSTest were published before Qwen3-8B training, creating contamination risk. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or tokens consumed are reported despite claims of 'reducing deployment costs.'"
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is mentioned (8 NVIDIA H800 GPUs) but total training time, GPU hours, or compute budget are not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No multi-seed results reported. All results appear to be from a single training run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Number of experimental runs not stated. Results appear to be single-run."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of hyperparameter search. The chosen hyperparameters (lr=1e-5, 5 epochs, etc.) are presented without justification for how they were selected."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No description of how the final configuration was selected. Multiple training decisions (5 epochs, lr choices) presented without selection justification."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Many comparisons across models and datasets with no statistical tests at all, let alone multiple comparison correction."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Authors evaluate their own system against baselines using their own in-house safety evaluator without acknowledging potential self-comparison bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Claims of surpassing DeepSeek-R1 (671B) on safety with an 8B model, but no analysis of compute budget differences between the proposed method and baselines."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the safety benchmarks (S-Eval, HarmBench) actually measure real-world safety. The gap between benchmark safety scores and deployment safety is not addressed."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding involved. Models are compared directly via generation."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Evaluation benchmarks (HarmBench 2024, S-Eval, XSTest 2023) predate Qwen3-8B training, creating temporal leakage risk. Not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation prompts or formats are similar to training data. The SAFETY training data is distilled from prompts extracted from safety datasets, which may include the same benchmarks used for evaluation (e.g., HarmBench), creating overlap risk."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "HarmBench appears in Table 1 as an evaluation set (300 EN-harmbench prompts), while Section 4.1 describes extracting prompts from safety datasets for distillation. The paper does not state whether the training prompts are disjoint from the evaluation prompts and does not discuss independence."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Single-stage MTC co-training matches two-stage SFT+DPO safety performance (MTC en pos 97.55 vs TPos/DPO en 97.58 avg English score)",
    365       "evidence": "Table 2 shows comparable scores across all 5 English evaluation datasets. Controlled comparison uses same base model and CHAT data.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "The 8B MTC model surpasses DeepSeek-R1 (671B) in safety performance",
    370       "evidence": "Table 2: MTC en pos avg 97.55 vs DeepSeek-R1 avg 87.45 on English benchmarks. However, DeepSeek-R1 uses think mode only while MTC uses no_think mode, and evaluation uses an in-house classifier.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Multi-directional self-distillation produces higher quality pos supervision than single-direction distillation",
    375       "evidence": "Table 2: TPos en (93.03) substantially outperforms SPos en (77.55) on English average. Same training setup except data generation method.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Magic tokens induce a Safety Alignment Margin with structured behavioral separation",
    380       "evidence": "Table 4: MTC en SAM=0.131 vs baselines near zero. PCA visualization in Figure 3 shows well-separated clusters. But SAM is computed differently for MTC (3000 samples across modes) vs baselines (1000 samples, single mode).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Magic tokens enable precise behavioral control at inference time",
    385       "evidence": "Table 3: pos mode yields 95.8% positive responses, rej mode yields 88.6% refusals. Neg mode shows 67.8% negative (with 31.8% positive, largely on safe prompts).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Multi-policy model integrates cross-cultural safety norms in a single model",
    390       "evidence": "Table 2: MTC/MP pos achieves 97.45 English avg and 95.13 Chinese avg, outperforming language-specific variants on Chinese.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "In-house evaluator used for primary results",
    397       "detail": "The primary evaluation metric (Constructive Safety Score) relies on an in-house safety evaluation classifier. While they validate it on 2,540 samples (97.5% accuracy), the classifier may have biases favorable to the in-house model. Extended evaluation with open-source evaluators in Appendix C partially mitigates this."
    398     },
    399     {
    400       "flag": "Company evaluating own product",
    401       "detail": "All authors are from Qiyuan Tech (Qihoo 360), evaluating their own safety framework. The in-house Chinese evaluation datasets (ZH-Red, ZH-Red attack) are not publicly available for independent verification."
    402     },
    403     {
    404       "flag": "Potential training-evaluation data overlap",
    405       "detail": "The paper extracts prompts from safety datasets for self-distillation training. It's unclear whether evaluation prompts from HarmBench, S-Eval, or other benchmarks overlap with training prompts used for distillation."
    406     },
    407     {
    408       "flag": "No variance or multi-run results",
    409       "detail": "All results appear to be from single training runs with no error bars, standard deviations, or multi-seed analysis. This is concerning for a method that claims to reliably embed behavioral separation."
    410     },
    411     {
    412       "flag": "Unfair SAM comparison",
    413       "detail": "SAM for MTC en is computed over 3,000 responses (3 modes × 1,000) while baselines use 1,000 responses. The paper acknowledges baselines have no behavioral switching concept, but computing a separation metric across deliberately different modes vs within a single mode is not a fair comparison — it's measuring the design intent rather than validating it."
    414     },
    415     {
    416       "flag": "Overclaiming scalability",
    417       "detail": "The paper claims 'a scalable, efficient, and highly controllable solution for LLM content safety' but tests only on Qwen3-8B. No evidence of scalability to other models, architectures, or sizes."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    423       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    424       "year": 2024,
    425       "arxiv_id": "2401.05566",
    426       "relevance": "Demonstrates that LLMs can develop persistent deceptive behaviors surviving safety training — directly relevant to AI safety alignment research."
    427     },
    428     {
    429       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    430       "authors": ["Jan Betley", "Daniel Tan", "Niels Warncke"],
    431       "year": 2025,
    432       "arxiv_id": "2502.17424",
    433       "relevance": "Shows fine-tuning can induce emergent broad misalignment across domains, relevant to safety and alignment methodology."
    434     },
    435     {
    436       "title": "Direct preference optimization: Your language model is secretly a reward model",
    437       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    438       "year": 2023,
    439       "relevance": "Core alignment method (DPO) that this paper positions against as a simpler alternative."
    440     },
    441     {
    442       "title": "Constitutional AI: Harmlessness from AI feedback",
    443       "authors": ["Yuntao Bai"],
    444       "year": 2022,
    445       "arxiv_id": "2212.08073",
    446       "relevance": "Foundational RLAIF approach for LLM safety alignment."
    447     },
    448     {
    449       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    450       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin"],
    451       "year": 2024,
    452       "arxiv_id": "2402.04249",
    453       "relevance": "Key safety evaluation benchmark used in this paper's experiments."
    454     },
    455     {
    456       "title": "S-Eval: Towards automated and comprehensive safety evaluation for large language models",
    457       "authors": ["Xiaohan Yuan", "Jinfeng Li"],
    458       "year": 2025,
    459       "doi": "10.1145/3728971",
    460       "relevance": "Primary safety evaluation benchmark providing both base and attack evaluation sets used in this study."
    461     },
    462     {
    463       "title": "AEGIS 2.0: A diverse AI safety dataset and risks taxonomy for alignment of LLM guardrails",
    464       "authors": ["Shaona Ghosh", "Prasoon Varshney"],
    465       "year": 2025,
    466       "doi": "10.18653/v1/2025.naacl-long.306",
    467       "relevance": "Provides the risk taxonomy and safety policy framework used for English safety data distillation."
    468     },
    469     {
    470       "title": "Controllable safety alignment: Inference-time adaptation to diverse safety requirements",
    471       "authors": ["Jingyu Zhang", "Ahmed Elgohary"],
    472       "year": 2025,
    473       "arxiv_id": "2410.08968",
    474       "relevance": "Directly related work on controllable safety alignment at inference time; provides CoSA-Score metric used in evaluation."
    475     },
    476     {
    477       "title": "Persona features control emergent misalignment",
    478       "authors": ["Miles Wang", "Tom Dupré la Tour"],
    479       "year": 2025,
    480       "arxiv_id": "2506.19823",
    481       "relevance": "Identifies internal misaligned persona features using sparse autoencoders, relevant to understanding behavioral control in LLMs."
    482     },
    483     {
    484       "title": "Llama-Nemotron: Efficient reasoning models",
    485       "authors": ["Akhiad Bercovich"],
    486       "year": 2025,
    487       "arxiv_id": "2505.00949",
    488       "relevance": "Source of training data (safety.jsonl, chat.jsonl) and baseline model used in experiments."
    489     },
    490     {
    491       "title": "Safe RLHF: Safe reinforcement learning from human feedback",
    492       "authors": ["Josef Dai", "Xuehai Pan"],
    493       "year": 2023,
    494       "arxiv_id": "2310.12773",
    495       "relevance": "Multi-objective safety alignment approach relevant to comparison with this paper's single-stage method."
    496     }
    497   ]
    498 }

Impressum · Datenschutz