scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31471B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Efficient Switchable Safety Control in LLMs via Magic-Token-Guided Co-Training",
      6     "authors": [
      7       "Jianfeng Si",
      8       "Lin Sun",
      9       "Zhewen Tan",
     10       "Xiangzheng Zhang"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2508.14904",
     15     "doi": "10.48550/arXiv.2508.14904"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims of matching SFT+DPO quality (97.55 vs 97.58, Table 2), surpassing DeepSeek-R1 671B in safety (97.55 vs 87.45), and reduced training complexity (single SFT stage) are all supported by Table 2 results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims like 'multi-directional distillation improves pos quality' are supported by controlled ablations (SPos vs TPos, same data pipeline, same base model). The ablation design holds confounds constant (same CHAT data, same base model).",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The abstract claims a 'scalable, efficient, and highly controllable solution for LLM content safety' but experiments are conducted only on Qwen3-8B. No evidence of scalability to other model families or sizes. Title claims 'LLMs' generally.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No discussion of alternative explanations for the results. Could the safety improvement come from simply training on more safety data (3x triplets)? Could the in-house evaluator favor the in-house model? These confounds are not discussed.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper frames the Constructive Safety Score as measuring 'safety alignment quality' without discussing the gap between automated classifier scores and actual content safety. The 0/1/2 scoring system conflates refusal with constructive engagement, and the paper does not discuss the limitations of this design choice.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations section. The conclusion mentions 'mitigating potential misuse of neg modes' as future work but does not substantively discuss limitations.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No threats to validity discussed. The reliance on a single base model, single in-house evaluator, and potential evaluator bias are not addressed.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No explicit scope boundaries stated. The paper does not clarify that results are specific to Qwen3-8B or that the approach is untested on other architectures or model sizes.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding information disclosed. Authors are from Qiyuan Tech (a subsidiary of Qihoo 360) but no funding statement is provided.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations clearly stated: all from 'Qiyuan Tech, Beijing, China.' The GitHub repo is under Qihoo360.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Qiyuan Tech/Qihoo 360 is the employer and has commercial interest in demonstrating effective safety controls for their LLM products. The funder is not independent of the outcome.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement. Authors from a commercial AI company evaluating their own safety framework without declaring financial interests.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are explicitly defined: 'magic tokens' are described as randomly generated cryptographic-like strings, 'pos/neg/rej' behaviors are defined with examples, and 'Safety Alignment Margin' is formally defined using the Silhouette Coefficient formula.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Four explicit contributions are listed: self-distillation for behavioral separation, magic-token co-training framework, Safety Alignment Margin metric, and culture-aware multi-policy extension.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Five subsections engage specifically with SFT/RLHF/DPO paradigms, self-distillation, controllable LLMs, sleeper agents, and red-teaming, explicitly articulating how the proposed work differs from each.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "GitHub link provided: https://github.com/Qihoo360/LLMs-Safety-Control. Also a safer variant TinyR1-S-8B is mentioned as released.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "EN-ALIGN and ZH-ALIGN datasets generated via self-distillation are described but no download link is provided. The in-house Chinese evaluation datasets (ZH-Red, ZH-Red attack) are not released. Some evaluation datasets are public (S-Eval, HarmBench, XSTest).",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper mentions ModelScope/ms-swift framework and 8 NVIDIA H800 GPUs but does not provide requirements.txt, library versions, or environment setup details.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link exists but the paper itself does not describe how to replicate the experiments.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 2-5 are reported as point estimates with no confidence intervals or error bars.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper claims 'outperforms' various baselines based on comparing raw numbers without any statistical significance tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "The paper reports percentage improvements with context: e.g., 'baselines experience an average performance drop of 21.5% under attack, ours declines by 3.8% only' (Figure 1), and full baseline-vs-method scores in Table 2 allow effect size computation.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification for evaluation dataset sizes (e.g., 300 HarmBench, 1000 S-Eval samples). No power analysis.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or spread measures reported across runs. Results appear to be single-run numbers.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Table 2 includes multiple baselines: Qwen3-8B, DeepSeek-R1-8B, Nemotron-8B, Llama3-8B, Qwen3-32B, DeepSeek-R1 (671B), plus ablation variants (SPos, TPos, TPos/DPO).",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include recent models: Qwen3-8B/32B, DeepSeek-R1-0528, Llama-3.1-Nemotron-Nano-8B, and comparison with DPO methods. These are contemporary as of 2025.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Table 2 includes systematic ablations: SPos vs TPos (single vs multi-directional distillation), TPos vs TPos/DPO (with/without DPO stage), MTC vs TPos (co-training vs single-behavior). Table 4 ablates SAM across model variants.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Primary metric is Constructive Safety Score. Extended evaluation in Appendix C adds Safety Score (S), Helpfulness Score (H), and CoSA-Score (C). Also SAM metric in Table 4.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Manual review of 2,540 samples to validate the safety evaluation classifier accuracy (94.7%/99.6%/98.9% per-class accuracy, 97.5% overall). This is human evaluation of the evaluation tool, not the system outputs directly, but does involve human judgment on system outputs.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Evaluation uses separate benchmark datasets (HarmBench, S-Eval, XSTest, NVIDIA Aegis 2.0) that are distinct from the training data. Training uses Llama-Nemotron chat data and self-distilled safety data.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 2 provides per-dataset breakdowns across 5 English and 4 Chinese evaluation sets. Table 3 breaks down behavioral controllability per mode per dataset. Table 5 provides per-benchmark S/H/C scores.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Table 3 discusses neg mode producing pos outputs in 31.8% of cases (50% on XSTest), with analysis attributing this to safe prompts where the model 'appropriately avoids introducing risks.' MTC/MP rand and no-token failure modes are also tested.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The neg mode achieves only 67.8% negative response rate (not perfect control). MTC/MP rand and MTC/MP no show degraded performance. TinyR1-S-8B/adh mode shows negative CoSA scores, acknowledged as 'prioritizes usefulness at the cost of safety.'",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Qwen3-8B is specified as the base model. Specific model identifiers given for baselines: DeepSeek-R1-0528-Qwen3-8B, Llama-3.1-Nemotron-Nano-8B-v1, Meta-Llama-3.1-8B-Instruct.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix A provides the full prompt template for multi-directional self-distillation. Appendix B provides the helpfulness evaluation prompt. The magic token strings are provided (rfcd9lbo, 8v4v5sa3, q787fvif).",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Section 4.2 reports: SFT 5 epochs, lr=1e-5, warmup=0.01; DPO 1 epoch, lr=1e-6, β=0.1; inference temperature=0.9, top_p=0.6, max_tokens=4k.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The method is a single-stage SFT training approach with magic tokens in system prompts.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 4.1 describes data construction: EN/CHAT from Llama-Nemotron (39,792 pairs), EN/SAFETY from 11,010 prompts yielding 10,977 per behavior, ZH/CHAT 20,000 pairs, ZH/SAFETY 16,521 per behavior. Think/no-think duplication process described.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw model outputs and evaluation scores are not available for independent verification. Only aggregated scores are reported in tables.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 4.1 describes the data collection: sources for CHAT data, self-distillation process for SAFETY data with specific policy frameworks (AEGIS 2.0, Chinese regulatory taxonomy). Section 4.3 describes evaluation datasets.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants recruited. Evaluation data comes from standard benchmarks and in-house constructed datasets.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline is documented: base model generates triplets under policy-guided prompts → think/no-think duplication → mixing with CHAT data → SFT training. Sample counts provided at each stage (11,010 → 10,977 per behavior).",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The training cutoff of Qwen3-8B base model is not stated. This matters because the model is fine-tuned and evaluated on benchmarks that may overlap with pre-training data.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether Qwen3-8B's pre-training data overlaps with evaluation benchmarks (HarmBench, S-Eval, XSTest).",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "HarmBench (2024), S-Eval, and XSTest were published before Qwen3-8B training, creating contamination risk. This is not discussed.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in the study.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in the study.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in the study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in the study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in the study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in the study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in the study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference cost, latency, or tokens consumed are reported despite claims of 'reducing deployment costs.'",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware is mentioned (8 NVIDIA H800 GPUs) but total training time, GPU hours, or compute budget are not stated.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No multi-seed results reported. All results appear to be from a single training run.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "Number of experimental runs not stated. Results appear to be single-run.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No mention of hyperparameter search. The chosen hyperparameters (lr=1e-5, 5 epochs, etc.) are presented without justification for how they were selected.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "No description of how the final configuration was selected. Multiple training decisions (5 epochs, lr choices) presented without selection justification.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "Many comparisons across models and datasets with no statistical tests at all, let alone multiple comparison correction.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "Authors evaluate their own system against baselines using their own in-house safety evaluator without acknowledging potential self-comparison bias.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "Claims of surpassing DeepSeek-R1 (671B) with an 8B model, but no analysis of compute budget differences between the proposed method and baselines.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "No discussion of whether the safety benchmarks (S-Eval, HarmBench) actually measure real-world safety. The gap between benchmark safety scores and deployment safety is not addressed.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No scaffolding involved. Models are compared directly via generation.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "Evaluation benchmarks (HarmBench 2024, S-Eval, XSTest 2023) predate Qwen3-8B training, creating temporal leakage risk. Not discussed.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether evaluation prompts or formats are similar to training data. The SAFETY training data uses prompts from the same benchmarks (HarmBench prompts used in EN-SAFETY distillation), creating direct overlap risk.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "HarmBench prompts appear to be used for both training data generation and evaluation: Table 1 lists the 300-prompt EN-harmbench set as evaluation data, while Section 4.1 describes extracting prompts from safety datasets for distillation. The paper does not discuss whether the two sets are independent.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection or prevention method applied.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Single-stage magic-token co-training matches two-stage SFT+DPO safety performance (97.55 vs 97.58 avg English)",
    456       "evidence": "Table 2 shows MTC en pos = 97.55 vs TPos/DPO en = 97.58 across 5 English evaluation datasets",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "The 8B model surpasses DeepSeek-R1 671B in safety performance",
    461       "evidence": "Table 2 shows MTC en pos avg(en)=97.55 vs DSR1(think) avg(en)=87.45, but DSR1 evaluated in think mode while authors' model uses no-think mode — inherently unfair comparison",
    462       "supported": "weak"
    463     },
    464     {
    465       "claim": "Multi-directional self-distillation substantially improves positive supervision quality over single-direction distillation",
    466       "evidence": "TPos en (93.03 avg English) outperforms SPos en (77.55) with CHAT data held constant, a 15.5pp gap across 5 datasets",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "Magic tokens enable reliable behavioral switching at inference time",
    471       "evidence": "Table 3 shows pos mode 95.8% accuracy and rej mode 88.6% accuracy, but neg mode only 67.8% with 31.8% pos leakage",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Co-training induces a structured Safety Alignment Margin (SAM=0.131) versus near-zero for baselines",
    476       "evidence": "Table 4 shows MTC en SAM=0.131 vs ≤0.051 for all baselines; Figure 3 PCA shows cluster separation",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "Multi-policy extension integrates English and Chinese safety norms, achieving top scores in both languages",
    481       "evidence": "MTC/MP pos achieves 97.45 avg(en) and 95.13 avg(zh), highest among all fine-tuned and open-source variants in Table 2",
    482       "supported": "moderate"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval"
    487   ],
    488   "key_findings": "Magic-token-guided co-training embeds three behavioral modes (positive, negative, rejective) in a single Qwen3-8B model via one SFT stage, achieving safety scores matching two-stage SFT+DPO pipelines (97.55 vs 97.58 avg English) without complex multi-stage training. Multi-directional self-distillation substantially outperforms single-direction distillation (93.03 vs 77.55), suggesting that generating neg and rej responses in contrast improves positive supervision quality. The framework induces a measurable Safety Alignment Margin (SAM=0.131 vs ≤0.051 for all baselines), providing quantitative evidence of structured behavioral separation in logit space. A multi-policy extension successfully integrates English and Chinese safety norms via control tokens, achieving top scores across both language benchmarks simultaneously.",
    489   "red_flags": [
    490     {
    491       "flag": "Unfair flagship comparison",
    492       "detail": "The headline claim of surpassing DeepSeek-R1 671B compares no-think inference (authors' model) vs think-mode-only inference (DSR1), which is an inherently unfair comparison; the authors' model also uses 83× fewer parameters."
    493     },
    494     {
    495       "flag": "In-house evaluator bias risk",
    496       "detail": "Primary results in Table 2 use an in-house safety evaluator developed by the same lab; although validated at 97.5% accuracy, systematic biases favoring the authors' model on their training distribution cannot be ruled out."
    497     },
    498     {
    499       "flag": "No statistical significance testing",
    500       "detail": "No confidence intervals, error bars, or significance tests are reported for any comparative claims; near-identical scores (e.g., 97.55 vs 97.58) are presented as parity, and larger gaps as improvements, without statistical support."
    501     },
    502     {
    503       "flag": "Training-evaluation data overlap",
    504       "detail": "The NV evaluation dataset is AEGIS 2.0 data, and the SAFETY training data is generated using AEGIS 2.0 safety policies as the policy guideline; potential policy-level contamination is not discussed."
    505     },
    506     {
    507       "flag": "No limitations section",
    508       "detail": "The paper contains no dedicated limitations or threats-to-validity section; generalization to other model architectures, sizes beyond 8B, or languages beyond English/Chinese is not addressed."
    509     },
    510     {
    511       "flag": "Neg mode reliability overstated",
    512       "detail": "The neg mode achieves only 67.8% reliability (31.8% false-safe leakage on average, reaching 50% on XSTest), yet is presented as a functional red-teaming tool without adequately flagging this unreliability as a limitation."
    513     },
    514     {
    515       "flag": "Single architecture generalization",
    516       "detail": "All fine-tuned variants use Qwen3-8B base; claims of 'scalability' and broad applicability are not supported by any multi-scale or multi-architecture experiments."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Training language models to follow instructions with human feedback (InstructGPT/RLHF)",
    522       "relevance": "Foundation work on SFT+RLHF alignment that the proposed method aims to simplify and improve upon"
    523     },
    524     {
    525       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    526       "relevance": "Primary baseline alignment method the single-stage co-training claims to match without multi-stage complexity"
    527     },
    528     {
    529       "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training",
    530       "relevance": "Motivating related work on unintended behavioral switching; the paper explicitly contrasts its transparent magic-token approach against covert backdoor triggers"
    531     },
    532     {
    533       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    534       "relevance": "Documents conventional alignment failures under adversarial attacks, motivating the paper's adversarial robustness focus"
    535     },
    536     {
    537       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    538       "relevance": "Primary evaluation benchmark used to measure safety performance across all evaluated models"
    539     },
    540     {
    541       "title": "S-Eval: Towards Automated and Comprehensive Safety Evaluation for Large Language Models",
    542       "relevance": "Key evaluation benchmark for both English and Chinese safety assessment (base and attack variants)"
    543     },
    544     {
    545       "title": "AEGIS2.0: A Diverse AI Safety Dataset and Risks Taxonomy for Alignment of LLM Guardrails",
    546       "relevance": "Provides the 14-category risk taxonomy and safety policy guidance used for English self-distillation, also contributes the NV evaluation dataset"
    547     },
    548     {
    549       "title": "Emergent Misalignment: Narrow Finetuning Can Produce Broadly Misaligned LLMs",
    550       "relevance": "Demonstrates that finetuning on harmful content can cause broad misalignment; the paper argues its co-training framework avoids this via structured margin separation"
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 2,
    556       "justification": "Addresses real deployment needs (red-teaming, safe user interaction, compliance refusal) in a single model, and a safer public variant (TinyR1-S-8B) with code is released."
    557     },
    558     "surprise_contrarian": {
    559       "score": 2,
    560       "justification": "The claims that an 8B model beats DeepSeek-R1 671B on safety, and that intentionally generating harmful content during training improves safe responses, are both counterintuitive."
    561     },
    562     "fear_safety": {
    563       "score": 2,
    564       "justification": "The paper builds a model with a built-in harmful-content generation mode and explicitly discusses its dual-use risks; the authors withhold the full model (MTC/MP) precisely because of these risks."
    565     },
    566     "drama_conflict": {
    567       "score": 1,
    568       "justification": "The intentional creation of a neg mode for harmful content within a safety framework has mild controversy potential, though the paper's responsible-disclosure framing dampens drama."
    569     },
    570     "demo_ability": {
    571       "score": 2,
    572       "justification": "Code released on GitHub and a public TinyR1-S-8B model allow practitioners to try the safe variant of the framework directly."
    573     },
    574     "brand_recognition": {
    575       "score": 1,
    576       "justification": "Qihoo360/Qiyuan Tech is a recognized Chinese cybersecurity company but not a top-tier AI research institution in the global AI community."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [
    581       {
    582         "hn_id": "44963444",
    583         "title": "ComputerRL: Scaling Reinforcement Learning for Computer Use Agents",
    584         "points": 1,
    585         "comments": 0,
    586         "url": "https://news.ycombinator.com/item?id=44963444",
    587         "created_at": "2025-08-20T16:37:58Z"
    588       },
    589       {
    590         "hn_id": "44116793",
    591         "title": "When Models Don't Collapse: On the Consistency of Iterative MLE",
    592         "points": 1,
    593         "comments": 0,
    594         "url": "https://news.ycombinator.com/item?id=44116793",
    595         "created_at": "2025-05-28T15:06:51Z"
    596       },
    597       {
    598         "hn_id": "43291999",
    599         "title": "Think Inside the JSON: Reinforcement Strategy for Strict LLM Schema Adherence",
    600         "points": 1,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=43291999",
    603         "created_at": "2025-03-07T17:19:08Z"
    604       },
    605       {
    606         "hn_id": "43207715",
    607         "title": "GneissWeb: Preparing High Quality Data for LLMs at Scale",
    608         "points": 1,
    609         "comments": 0,
    610         "url": "https://news.ycombinator.com/item?id=43207715",
    611         "created_at": "2025-02-28T16:50:52Z"
    612       }
    613     ],
    614     "top_points": 1,
    615     "total_points": 4,
    616     "total_comments": 0
    617   }
    618 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs