ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (28526B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Efficient Switchable Safety Control in LLMs via Magic-Token-Guided Co-Training",
      6     "authors": [
      7       "Jianfeng Si",
      8       "Lin Sun",
      9       "Zhewen Tan",
     10       "Xiangzheng Zhang"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2508.14904",
     15     "doi": "10.48550/arXiv.2508.14904"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract prominently claims the 8B model 'notably surpasses DeepSeek-R1 (671B)' but this compares a safety-specialized fine-tune against a general reasoning model in different inference modes (no-think vs think). The claim of 'significantly reducing deployment costs' is asserted without quantification.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims about multi-directional distillation and magic tokens are supported by controlled ablations: SPos vs TPos vs MTC isolates each design choice, providing adequate evidence for the primary causal assertions.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "All experiments use a single base model (Qwen3-8B) but the paper makes broad claims about 'scalable safety architectures for LLMs' and 'diverse deployment scenarios' without bounding results to the tested model family.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss whether safety improvement may stem from training data quality (AEGIS 2.0) rather than the magic-token mechanism, nor whether the in-house evaluator may favor outputs similar to training distribution.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The Constructive Safety Score is formally defined with a 3-level scoring system; the in-house evaluator is validated at 97.5% accuracy on 2,540 manual reviews; extended evaluation using third-party evaluators (S-Eval, GPT-OSS, Qwen3Guard) is provided in Appendix C.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions 'mitigating potential misuse of neg modes' as future work, which does not constitute a limitations discussion.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No specific threats to validity are discussed, such as potential train-evaluation overlap between AEGIS 2.0 training prompts and S-Eval test sets, single-model generalizability concerns, or in-house evaluator bias.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper uses broad language ('this paradigm opens new avenues for scalable safety architectures') without explicitly stating what results do NOT show — e.g., that only Qwen3-8B was tested or that real-world safety beyond these benchmarks is untested.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding statement is present. Authors are from Qiyuan Tech (Qihoo 360) as indicated by the GitHub repository at github.com/Qihoo360, but no explicit funding acknowledgment is made.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors list 'Qiyuan Tech, Beijing, China' as their affiliation, and the code repository under Qihoo360's GitHub confirms the institutional context.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The research is conducted by employees of Qiyuan Tech (Qihoo 360) evaluating their own framework; the organization has a direct interest in the method's reported success.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement is included. There is no declaration of patents, equity, or other financial interests anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Magic tokens are defined as randomly generated string identifiers (e.g., 'rfcd9lbo'). The three behavioral modes (pos/neg/rej) are clearly specified. Safety Alignment Margin is formally defined via Silhouette Coefficient in Section 3.3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The introduction lists four explicit bullet-point contributions: self-distillation data quality, magic-token co-training for behavioral switching, the SAM metric, and culture-aware multi-policy safety control.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 has five subsections covering SFT/RLHF/DPO paradigms, self-distillation, controllable behavior, deceptive misalignment (sleeper agents), and red-teaming — explicitly positioning the work relative to each.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper provides a GitHub link 'https://github.com/Qihoo360/LLMs-Safety-Control' labeled 'Code & Datasets' in the opening section.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Two key evaluation datasets (ZH/Red with 3,000 samples, ZH/Red attack with 988 samples) are described as 'in-house' with no confirmation of public release. The self-generated EN-ALIGN/ZH-ALIGN training datasets are also not confirmed as released.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper specifies 'ModelScope/ms-swift framework on 8 NVIDIA H800 GPUs' but provides no requirements.txt, Dockerfile, or pinned dependency versions.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Hyperparameters are provided but no step-by-step reproduction instructions exist; readers must infer the training pipeline from Sections 3 and 4.2 without explicit guidance.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 2 and 5 are single-run point estimates with no confidence intervals or error bars reported.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to comparative claims such as 'MTC matches SFT+DPO' or 'TPos outperforms SPos'.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Raw performance scores with absolute differences are reported across methods (e.g., TPos en 93.03 vs SPos en 77.55; MTC en pos 97.55 vs TPos/DPO en 97.58), providing context for effect magnitude.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Evaluation dataset sizes (300–3,000 samples per dataset) are described in Table 1 but no power analysis or sample size justification is provided.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No standard deviations or variance across training runs or evaluation repeats are reported anywhere in the paper.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Multiple open-source baselines are included: Qwen3-8B, DSR1-8B, Nemotron-8B, Llama3-8B, Qwen3-32B, and DSR1 (671B) in Table 2.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include Qwen3-32B, DeepSeek-R1-0528, and Llama-3.1 variants — all contemporary 2024-2025 models.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "The comparison of SPos (single-direction) vs TPos (triple-direction) vs TPos/DPO vs MTC constitutes a clear ablation isolating each methodological contribution in Table 2.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper uses in-house Constructive Safety Score plus extended evaluation with Safety Score (S), Helpfulness Score (H), and CoSA-Score (C) using multiple third-party evaluators across 6 benchmarks in Appendix C.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation of system outputs is conducted. Manual review of 2,540 samples is used only to validate the in-house evaluator's accuracy, not to independently assess model outputs.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Standard benchmarks (HarmBench, S-Eval, XSTest) serve as held-out evaluation sets; training data is sourced from separate datasets (Llama-Nemotron SFT prompts, AEGIS 2.0 prompts).",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down across 5 English datasets (HB, NV, EA, EB, XS) and 4 Chinese datasets representing different risk categories and attack conditions; Table 3 additionally shows behavioral mode distribution per dataset.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 4.6 reports that neg mode achieves only 67.8% activation (31.8% produce positive responses), and on XS safe prompts neg mode falls to 50% reliability — incomplete controllability is explicitly acknowledged.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Table 2 reports MTC/MP rand (random tokens, 90.83 avg en) and MTC/MP no (no system prompt, 93.97 avg en) as degraded variants; Table 4 shows near-zero SAM for baseline models, providing honest comparative context.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Exact identifiers are given: Qwen3-8B as base model; baselines include 'DeepSeek-R1-0528-Qwen3-8B', 'Meta-Llama-3.1-8B-Instruct', 'Llama-3.1-Nemotron-Nano-8B-v1'.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix A provides the full multi-directional self-distillation prompt template (translated from Chinese) and Appendix B provides the helpfulness evaluation prompt.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Section 4.2 reports SFT: 5 epochs, lr=1e-5, warmup ratio=0.01; DPO: 1 epoch, lr=1e-6, β=0.1; inference: temperature=0.9, top_p=0.6, max_tokens=4k.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Section 3.2 describes the magic token system in detail: tokens are server-side injected into system prompts, never exposed to API users, with specific example token strings provided (rfcd9lbo, 8v4v5sa3, q787fvif).",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The self-distillation pipeline is documented in Sections 3.1 and 4.1 including policy sources, JSON output format, sample duplication for think/no-think modes, and per-behavior dataset sizes.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "ZH/Red (3,000) and ZH/Red attack (988) are described as in-house proprietary datasets. The EN-ALIGN and ZH-ALIGN training datasets generated via self-distillation are not confirmed as publicly released.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The self-distillation pipeline is documented: prompts from AEGIS 2.0 and Llama-Nemotron are used, responses generated by Qwen3-8B base under structured policy prompts, with sample counts given (EN: 10,977; ZH: 16,521 per behavior).",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants are involved in data collection; data is generated via automated self-distillation from the base model.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Sections 3.1 and 4.1 document the full pipeline: policy specification → structured prompting → multi-directional self-distillation → corpus construction → SFT training, with dataset composition tables.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The training data cutoff of Qwen3-8B (the base model) is not stated, which matters since standard benchmarks like HarmBench (2024) may have been present in Qwen3's pre-training data.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "There is no discussion of potential overlap between AEGIS 2.0 prompts used to generate training data (10,977 samples) and evaluation benchmarks that may share similar safety-critical prompt distributions.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Qwen3-8B may have seen HarmBench, XSTest, or S-Eval examples during pre-training; this is not acknowledged or addressed in the paper.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants are involved in the study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants are involved.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants are involved.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants are involved.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants are involved.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants are involved.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants are involved.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Inference settings (temperature, top-p, max tokens) are reported but actual latency or computational cost per inference call is not measured.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware is specified (8 NVIDIA H800 GPUs, 80GB) but total training time, GPU hours, or dollar cost are not reported.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Magic-token-guided co-training (single-stage SFT) achieves safety performance comparable to two-stage SFT+DPO",
    374       "evidence": "Table 2: MTC en pos scores 97.55 vs TPos/DPO en 97.58 on average English benchmarks, within 0.03 points",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The 8B model surpasses DeepSeek-R1 (671B) in safety performance",
    379       "evidence": "Table 2: MTC en pos avg(en)=97.55 vs DSR1(think)=87.45, but DeepSeek-R1 is a general reasoning model run in think mode while MTC is a safety-specialized fine-tune run in no-think mode — not a like-for-like comparison",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Multi-directional self-distillation produces significantly better positive supervision than single-direction distillation",
    384       "evidence": "Table 2: TPos en (multi-direction pos subset) achieves 93.03 vs SPos en (single-direction) 77.55, a 15.5pp improvement in controlled ablation",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Magic tokens induce structured behavioral separation in the output space, measured by Safety Alignment Margin",
    389       "evidence": "Table 4: MTC en achieves SAM=0.131, roughly 4x higher than Qwen3-8B (0.033); PCA in Figure 3 shows distinct logit clusters per behavioral mode",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The method is robust to adversarial attacks, declining only 3.8% under attack vs 21.5% average baseline drop",
    394       "evidence": "Figure 1 caption and Table 2 EA vs EB score comparisons confirm substantially smaller performance degradation for MTC variants vs open-source baselines",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Multi-policy fusion achieves state-of-the-art performance across both English and Chinese safety benchmarks",
    399       "evidence": "Table 2: MTC/MP pos scores 97.45 avg(en) and 95.13 avg(zh), highest among all evaluated models on both language sets",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "empirical"
    406   ],
    407   "key_findings": "Magic-token-guided co-training embeds three distinct safety behaviors (positive, negative, rejective) into a single Qwen3-8B model via one SFT stage, achieving alignment comparable to two-stage SFT+DPO (97.55 vs 97.58 on English benchmarks). Multi-directional self-distillation substantially improves positive supervision quality over single-direction methods (93.03 vs 77.55). The framework induces measurable behavioral separation in the logit space (SAM=0.131 vs 0.033 for the Qwen3-8B baseline) and extends to multi-cultural safety policies with competitive performance on both English and Chinese benchmarks. However, negative mode controllability is incomplete (67.8% reliability), all results are single-run point estimates from an in-house evaluator, and the framework is only tested on one model family.",
    408   "red_flags": [
    409     {
    410       "flag": "In-house evaluator as primary metric",
    411       "detail": "The main results in Table 2 rely on a proprietary safety classifier not available for independent verification; the 97.5% accuracy validation uses 2,540 self-generated samples that may not represent distribution shift scenarios."
    412     },
    413     {
    414       "flag": "Misleading size comparison in abstract",
    415       "detail": "The abstract prominently highlights surpassing DeepSeek-R1 (671B) but DeepSeek-R1 is a general reasoning model run in think mode, while MTC runs in no-think mode with safety-specific fine-tuning — not a valid safety-to-safety comparison."
    416     },
    417     {
    418       "flag": "No variance or confidence intervals",
    419       "detail": "All results are single-run point estimates; fine-tuning results are known to vary across random seeds but no variance is reported for any comparison in the paper."
    420     },
    421     {
    422       "flag": "Potential train-evaluation overlap",
    423       "detail": "AEGIS 2.0 prompts are used to generate training data (EN/SAFETY: 10,977 samples) and AEGIS 2.0 is also one of the evaluation benchmarks (NV: 1,964 samples); potential overlap is not discussed."
    424     },
    425     {
    426       "flag": "Author-defined evaluation metric (SAM)",
    427       "detail": "The Safety Alignment Margin is a novel metric invented by the authors to validate their own method, with no external reference for what constitutes a good SAM value or independent validation of the metric's meaning."
    428     },
    429     {
    430       "flag": "Single model family",
    431       "detail": "All experiments use Qwen3-8B as the base model; broad claims about 'scalable safety architectures for LLMs' are not empirically supported beyond this one model family."
    432     },
    433     {
    434       "flag": "Negative mode security analysis absent",
    435       "detail": "The paper acknowledges neg mode misuse risks as future work but provides no security analysis of what happens if the static magic token string is discovered, brute-forced, or leaked from server-side system prompts."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "Training language models to follow instructions with human feedback",
    441       "relevance": "Foundational RLHF alignment paper this work extends and compares against as the dominant alignment paradigm"
    442     },
    443     {
    444       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    445       "relevance": "Key two-stage baseline (SFT+DPO) that the proposed single-stage approach aims to match in safety performance"
    446     },
    447     {
    448       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    449       "relevance": "Motivates controllable safety behavior; the neg mode is positioned as a transparent alternative to inadvertent sleeper agent backdoors"
    450     },
    451     {
    452       "title": "Emergent Misalignment: Narrow Finetuning Can Produce Broadly Misaligned LLMs",
    453       "relevance": "Related work on unintended misalignment from fine-tuning, contrasted with this paper's claim of intentional, controlled behavioral embedding"
    454     },
    455     {
    456       "title": "S-Eval: Towards Automated and Comprehensive Safety Evaluation for Large Language Models",
    457       "relevance": "Primary evaluation benchmark used across multiple English and Chinese experiments; also provides one of the third-party evaluators in extended evaluation"
    458     },
    459     {
    460       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    461       "relevance": "Key safety evaluation benchmark for adversarial robustness testing; one of five English evaluation datasets"
    462     },
    463     {
    464       "title": "AEGIS2.0: A Diverse AI Safety Dataset and Risks Taxonomy for Alignment of LLM Guardrails",
    465       "relevance": "Provides the 14-category safety taxonomy and training prompt sources for the English alignment dataset EN/SAFETY"
    466     },
    467     {
    468       "title": "Controllable Safety Alignment: Inference-time Adaptation to Diverse Safety Requirements",
    469       "relevance": "Direct related work on controllable safety alignment; provides the CoSA-Score metric used in the extended evaluation in Appendix C"
    470     },
    471     {
    472       "title": "LlamaGuard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    473       "relevance": "Prior work on LLM-based safety evaluation systems that the approach relates to for scalable safety benchmarking"
    474     },
    475     {
    476       "title": "XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in Large Language Models",
    477       "relevance": "Evaluation benchmark for over-refusal and under-refusal balance; used to analyze neg mode behavior on safe vs unsafe prompts"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 2,
    483       "justification": "Directly addresses real deployment needs — switchable safety for red-teaming vs user-facing contexts — with code released and a public variant (TinyR1-S-8B) available."
    484     },
    485     "surprise_contrarian": {
    486       "score": 1,
    487       "justification": "The result that single-stage SFT co-training matches two-stage SFT+DPO is mildly surprising, but the core idea of conditional generation via control tokens is not novel."
    488     },
    489     "fear_safety": {
    490       "score": 2,
    491       "justification": "Deliberately embedding a harmful-content generation mode (neg) into a production model raises legitimate AI safety concerns about misuse if magic tokens are leaked or extracted from server-side system prompts."
    492     },
    493     "drama_conflict": {
    494       "score": 1,
    495       "justification": "Mild tension around whether intentionally embedding a harmful capability mode is responsible AI development; the paper addresses this defensively but does not fully resolve the concern."
    496     },
    497     "demo_ability": {
    498       "score": 2,
    499       "justification": "Code and datasets released at github.com/Qihoo360/LLMs-Safety-Control; the public TinyR1-S-8B safety variant is available for direct testing."
    500     },
    501     "brand_recognition": {
    502       "score": 0,
    503       "justification": "Qiyuan Tech / Qihoo 360 is not a prominent AI lab internationally and has low name recognition in the AI safety research community."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "44963444",
    510         "title": "ComputerRL: Scaling Reinforcement Learning for Computer Use Agents",
    511         "points": 1,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=44963444",
    514         "created_at": "2025-08-20T16:37:58Z"
    515       },
    516       {
    517         "hn_id": "44116793",
    518         "title": "When Models Don't Collapse: On the Consistency of Iterative MLE",
    519         "points": 1,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=44116793",
    522         "created_at": "2025-05-28T15:06:51Z"
    523       },
    524       {
    525         "hn_id": "43291999",
    526         "title": "Think Inside the JSON: Reinforcement Strategy for Strict LLM Schema Adherence",
    527         "points": 1,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=43291999",
    530         "created_at": "2025-03-07T17:19:08Z"
    531       },
    532       {
    533         "hn_id": "43207715",
    534         "title": "GneissWeb: Preparing High Quality Data for LLMs at Scale",
    535         "points": 1,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=43207715",
    538         "created_at": "2025-02-28T16:50:52Z"
    539       }
    540     ],
    541     "top_points": 1,
    542     "total_points": 4,
    543     "total_comments": 0
    544   }
    545 }

Impressum · Datenschutz