scan-v5.json (28526B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Efficient Switchable Safety Control in LLMs via Magic-Token-Guided Co-Training", 6 "authors": [ 7 "Jianfeng Si", 8 "Lin Sun", 9 "Zhewen Tan", 10 "Xiangzheng Zhang" 11 ], 12 "year": 2025, 13 "venue": "arXiv.org", 14 "arxiv_id": "2508.14904", 15 "doi": "10.48550/arXiv.2508.14904" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": false, 22 "justification": "The abstract prominently claims the 8B model 'notably surpasses DeepSeek-R1 (671B)' but this compares a safety-specialized fine-tune against a general reasoning model in different inference modes (no-think vs think). The claim of 'significantly reducing deployment costs' is asserted without quantification.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Causal claims about multi-directional distillation and magic tokens are supported by controlled ablations: SPos vs TPos vs MTC isolates each design choice, providing adequate evidence for the primary causal assertions.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": false, 34 "justification": "All experiments use a single base model (Qwen3-8B) but the paper makes broad claims about 'scalable safety architectures for LLMs' and 'diverse deployment scenarios' without bounding results to the tested model family.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper does not discuss whether safety improvement may stem from training data quality (AEGIS 2.0) rather than the magic-token mechanism, nor whether the in-house evaluator may favor outputs similar to training distribution.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "The Constructive 
Safety Score is formally defined with a 3-level scoring system; the in-house evaluator is validated at 97.5% accuracy on 2,540 manual reviews; extended evaluation using third-party evaluators (S-Eval, GPT-OSS, Qwen3Guard) is provided in Appendix C.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": false, 54 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions 'mitigating potential misuse of neg modes' as future work, which does not constitute a limitations discussion.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": false, 60 "justification": "No specific threats to validity are discussed, such as potential train-evaluation overlap between AEGIS 2.0 training prompts and S-Eval test sets, single-model generalizability concerns, or in-house evaluator bias.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper uses broad language ('this paradigm opens new avenues for scalable safety architectures') without explicitly stating what results do NOT show — e.g., that only Qwen3-8B was tested or that real-world safety beyond these benchmarks is untested.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding statement is present. 
Authors are from Qiyuan Tech (Qihoo 360) as indicated by the GitHub repository at github.com/Qihoo360, but no explicit funding acknowledgment is made.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "All four authors list 'Qiyuan Tech, Beijing, China' as their affiliation, and the code repository under Qihoo360's GitHub confirms the institutional context.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "The research is conducted by employees of Qiyuan Tech (Qihoo 360) evaluating their own framework; the organization has a direct interest in the method's reported success.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement is included. There is no declaration of patents, equity, or other financial interests anywhere in the paper.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Magic tokens are defined as randomly generated string identifiers (e.g., 'rfcd9lbo'). The three behavioral modes (pos/neg/rej) are clearly specified. 
Safety Alignment Margin is formally defined via Silhouette Coefficient in Section 3.3.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "The introduction lists four explicit bullet-point contributions: self-distillation data quality, magic-token co-training for behavioral switching, the SAM metric, and culture-aware multi-policy safety control.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 has five subsections covering SFT/RLHF/DPO paradigms, self-distillation, controllable behavior, deceptive misalignment (sleeper agents), and red-teaming — explicitly positioning the work relative to each.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper provides a GitHub link 'https://github.com/Qihoo360/LLMs-Safety-Control' labeled 'Code & Datasets' in the opening section.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": false, 129 "justification": "Two key evaluation datasets (ZH/Red with 3,000 samples, ZH/Red attack with 988 samples) are described as 'in-house' with no confirmation of public release. 
The self-generated EN-ALIGN/ZH-ALIGN training datasets are also not confirmed as released.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper specifies 'ModelScope/ms-swift framework on 8 NVIDIA H800 GPUs' but provides no requirements.txt, Dockerfile, or pinned dependency versions.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "Hyperparameters are provided but no step-by-step reproduction instructions exist; readers must infer the training pipeline from Sections 3 and 4.2 without explicit guidance.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "All results in Tables 2 and 5 are single-run point estimates with no confidence intervals or error bars reported.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests are applied to comparative claims such as 'MTC matches SFT+DPO' or 'TPos outperforms SPos'.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Raw performance scores with absolute differences are reported across methods (e.g., TPos en 93.03 vs SPos en 77.55; MTC en pos 97.55 vs TPos/DPO en 97.58), providing context for effect magnitude.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "Evaluation dataset sizes (300–3,000 samples per dataset) are described in Table 1 but no power analysis or sample size justification is provided.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "No standard deviations or variance across training runs or evaluation 
repeats are reported anywhere in the paper.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "Multiple open-source baselines are included: Qwen3-8B, DSR1-8B, Nemotron-8B, Llama3-8B, Qwen3-32B, and DSR1 (671B) in Table 2.", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "Baselines include Qwen3-32B, DeepSeek-R1-0528, and Llama-3.1 variants — all contemporary 2024-2025 models.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "The comparison of SPos (single-direction) vs TPos (triple-direction) vs TPos/DPO vs MTC constitutes a clear ablation isolating each methodological contribution in Table 2.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper uses in-house Constructive Safety Score plus extended evaluation with Safety Score (S), Helpfulness Score (H), and CoSA-Score (C) using multiple third-party evaluators across 6 benchmarks in Appendix C.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": false, 205 "justification": "No human evaluation of system outputs is conducted. 
Manual review of 2,540 samples is used only to validate the in-house evaluator's accuracy, not to independently assess model outputs.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "Standard benchmarks (HarmBench, S-Eval, XSTest) serve as held-out evaluation sets; training data is sourced from separate datasets (Llama-Nemotron SFT prompts, AEGIS 2.0 prompts).", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Results are broken down across 5 English datasets (HB, NV, EA, EB, XS) and 4 Chinese datasets representing different risk categories and attack conditions; Table 3 additionally shows behavioral mode distribution per dataset.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Section 4.6 reports that neg mode achieves only 67.8% activation (31.8% produce positive responses), and on XS safe prompts neg mode falls to 50% reliability — incomplete controllability is explicitly acknowledged.", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Table 2 reports MTC/MP rand (random tokens, 90.83 avg en) and MTC/MP no (no system prompt, 93.97 avg en) as degraded variants; Table 4 shows near-zero SAM for baseline models, providing honest comparative context.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Exact identifiers are given: Qwen3-8B as base model; baselines include 'DeepSeek-R1-0528-Qwen3-8B', 'Meta-Llama-3.1-8B-Instruct', 'Llama-3.1-Nemotron-Nano-8B-v1'.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Appendix A provides the full multi-directional self-distillation prompt template 
(translated from Chinese) and Appendix B provides the helpfulness evaluation prompt.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Section 4.2 reports SFT: 5 epochs, lr=1e-5, warmup ratio=0.01; DPO: 1 epoch, lr=1e-6, β=0.1; inference: temperature=0.9, top_p=0.6, max_tokens=4k.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": true, 254 "answer": true, 255 "justification": "Section 3.2 describes the magic token system in detail: tokens are server-side injected into system prompts, never exposed to API users, with specific example token strings provided (rfcd9lbo, 8v4v5sa3, q787fvif).", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "The self-distillation pipeline is documented in Sections 3.1 and 4.1 including policy sources, JSON output format, sample duplication for think/no-think modes, and per-behavior dataset sizes.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": false, 269 "justification": "ZH/Red (3,000) and ZH/Red attack (988) are described as in-house proprietary datasets. 
The EN-ALIGN and ZH-ALIGN training datasets generated via self-distillation are not confirmed as publicly released.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "The self-distillation pipeline is documented: prompts from AEGIS 2.0 and Llama-Nemotron are used, responses generated by Qwen3-8B base under structured policy prompts, with sample counts given (EN: 10,977; ZH: 16,521 per behavior).", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants are involved in data collection; data is generated via automated self-distillation from the base model.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "Sections 3.1 and 4.1 document the full pipeline: policy specification → structured prompting → multi-directional self-distillation → corpus construction → SFT training, with dataset composition tables.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "The training data cutoff of Qwen3-8B (the base model) is not stated, which matters since standard benchmarks like HarmBench (2024) may have been present in Qwen3's pre-training data.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": false, 301 "justification": "There is no discussion of potential overlap between AEGIS 2.0 prompts used to generate training data (10,977 samples) and evaluation benchmarks that may share similar safety-critical prompt distributions.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": false, 307 "justification": "Qwen3-8B may have seen HarmBench, XSTest, or S-Eval examples during pre-training; this is not acknowledged or 
addressed in the paper.", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants are involved in the study.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants are involved.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants are involved.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants are involved.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants are involved.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants are involved.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants are involved.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": false, 359 "justification": "Inference settings (temperature, top-p, max tokens) are reported but actual latency or computational cost per inference call is not measured.", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "Hardware is specified (8 NVIDIA H800 GPUs, 80GB) but total training time, GPU hours, or dollar cost are not reported.", 366 "source": "haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Magic-token-guided co-training (single-stage SFT) achieves safety performance comparable to two-stage SFT+DPO", 374 "evidence": "Table 2: MTC en pos scores 
97.55 vs TPos/DPO en 97.58 on average English benchmarks, within 0.03 points", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "The 8B model surpasses DeepSeek-R1 (671B) in safety performance", 379 "evidence": "Table 2: MTC en pos avg(en)=97.55 vs DSR1(think)=87.45, but DeepSeek-R1 is a general reasoning model run in think mode while MTC uses no-think mode — not a fair comparison to safety-specialized models", 380 "supported": "weak" 381 }, 382 { 383 "claim": "Multi-directional self-distillation produces significantly better positive supervision than single-direction distillation", 384 "evidence": "Table 2: TPos en (multi-direction pos subset) achieves 93.03 vs SPos en (single-direction) 77.55, a 15.5pp improvement in controlled ablation", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Magic tokens induce structured behavioral separation in the output space, measured by Safety Alignment Margin", 389 "evidence": "Table 4: MTC en achieves SAM=0.131, roughly 4x that of Qwen3-8B (0.033); PCA in Figure 3 shows distinct logit clusters per behavioral mode", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "The method is robust to adversarial attacks, declining only 3.8% under attack vs 21.5% average baseline drop", 394 "evidence": "Figure 1 caption and Table 2 EA vs EB score comparisons confirm substantially smaller performance degradation for MTC variants vs open-source baselines", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Multi-policy fusion achieves state-of-the-art performance across both English and Chinese safety benchmarks", 399 "evidence": "Table 2: MTC/MP pos scores 97.45 avg(en) and 95.13 avg(zh), highest among all evaluated models on both language sets", 400 "supported": "moderate" 401 } 402 ], 403 "methodology_tags": [ 404 "benchmark-eval", 405 "empirical" 406 ], 407 "key_findings": "Magic-token-guided co-training embeds three distinct safety behaviors (positive, negative, rejective) into a single Qwen3-8B model via one SFT 
stage, achieving alignment comparable to two-stage SFT+DPO (97.55 vs 97.58 on English benchmarks). Multi-directional self-distillation substantially improves positive supervision quality over single-direction methods (93.03 vs 77.55). The framework induces measurable behavioral separation in the logit space (SAM=0.131 vs ~0.033 for baselines) and extends to multi-cultural safety policies with competitive performance in both English and Chinese benchmarks. However, negative mode controllability is incomplete (67.8% reliability), all results are single-run point estimates from an in-house evaluator, and the framework is only tested on one model family.", 408 "red_flags": [ 409 { 410 "flag": "In-house evaluator as primary metric", 411 "detail": "The main results in Table 2 rely on a proprietary safety classifier not available for independent verification; the 97.5% accuracy validation uses 2,540 self-generated samples that may not represent distribution shift scenarios." 412 }, 413 { 414 "flag": "Misleading size comparison in abstract", 415 "detail": "The abstract prominently highlights surpassing DeepSeek-R1 (671B) but DeepSeek-R1 is a general reasoning model run in think mode, while MTC runs in no-think mode with safety-specific fine-tuning — not a valid safety-to-safety comparison." 416 }, 417 { 418 "flag": "No variance or confidence intervals", 419 "detail": "All results are single-run point estimates; fine-tuning results are known to vary across random seeds but no variance is reported for any comparison in the paper." 420 }, 421 { 422 "flag": "Potential train-evaluation overlap", 423 "detail": "AEGIS 2.0 prompts are used to generate training data (EN/SAFETY: 10,977 samples) and AEGIS 2.0 is also one of the evaluation benchmarks (NV: 1,964 samples); potential overlap is not discussed." 
424 }, 425 { 426 "flag": "Author-defined evaluation metric (SAM)", 427 "detail": "The Safety Alignment Margin is a novel metric invented by the authors to validate their own method, with no external reference for what constitutes a good SAM value or independent validation of the metric's meaning." 428 }, 429 { 430 "flag": "Single model family", 431 "detail": "All experiments use Qwen3-8B as the base model; broad claims about 'scalable safety architectures for LLMs' are not empirically supported beyond this one model family." 432 }, 433 { 434 "flag": "Negative mode security analysis absent", 435 "detail": "The paper acknowledges neg mode misuse risks as future work but provides no security analysis of what happens if the static magic token string is discovered, brute-forced, or leaked from server-side system prompts." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "Training language models to follow instructions with human feedback", 441 "relevance": "Foundational RLHF alignment paper this work extends and compares against as the dominant alignment paradigm" 442 }, 443 { 444 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 445 "relevance": "Key two-stage baseline (SFT+DPO) that the proposed single-stage approach aims to match in safety performance" 446 }, 447 { 448 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 449 "relevance": "Motivates controllable safety behavior; the neg mode is positioned as a transparent alternative to inadvertent sleeper agent backdoors" 450 }, 451 { 452 "title": "Emergent Misalignment: Narrow Finetuning Can Produce Broadly Misaligned LLMs", 453 "relevance": "Related work on unintended misalignment from fine-tuning, contrasted with this paper's claim of intentional, controlled behavioral embedding" 454 }, 455 { 456 "title": "S-Eval: Towards Automated and Comprehensive Safety Evaluation for Large Language Models", 457 "relevance": "Primary evaluation 
benchmark used across multiple English and Chinese experiments; also provides one of the third-party evaluators in extended evaluation" 458 }, 459 { 460 "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal", 461 "relevance": "Key safety evaluation benchmark for adversarial robustness testing; one of five English evaluation datasets" 462 }, 463 { 464 "title": "AEGIS2.0: A Diverse AI Safety Dataset and Risks Taxonomy for Alignment of LLM Guardrails", 465 "relevance": "Provides the 14-category safety taxonomy and training prompt sources for the English alignment dataset EN/SAFETY" 466 }, 467 { 468 "title": "Controllable Safety Alignment: Inference-time Adaptation to Diverse Safety Requirements", 469 "relevance": "Direct related work on controllable safety alignment; provides the CoSA-Score metric used in the extended evaluation in Appendix C" 470 }, 471 { 472 "title": "LlamaGuard: LLM-based Input-Output Safeguard for Human-AI Conversations", 473 "relevance": "Prior work on LLM-based safety evaluation systems that the approach relates to for scalable safety benchmarking" 474 }, 475 { 476 "title": "XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in Large Language Models", 477 "relevance": "Evaluation benchmark for over-refusal and under-refusal balance; used to analyze neg mode behavior on safe vs unsafe prompts" 478 } 479 ], 480 "engagement_factors": { 481 "practical_relevance": { 482 "score": 2, 483 "justification": "Directly addresses real deployment needs — switchable safety for red-teaming vs user-facing contexts — with code released and a public variant (TinyR1-S-8B) available." 484 }, 485 "surprise_contrarian": { 486 "score": 1, 487 "justification": "The result that single-stage SFT co-training matches two-stage SFT+DPO is mildly surprising, but the core idea of conditional generation via control tokens is not novel." 
488 }, 489 "fear_safety": { 490 "score": 2, 491 "justification": "Deliberately embedding a harmful-content generation mode (neg) into a production model raises legitimate AI safety concerns about misuse if magic tokens are leaked or extracted from server-side system prompts." 492 }, 493 "drama_conflict": { 494 "score": 1, 495 "justification": "Mild tension around whether intentionally embedding a harmful capability mode is responsible AI development; the paper addresses this defensively but does not fully resolve the concern." 496 }, 497 "demo_ability": { 498 "score": 2, 499 "justification": "Code and datasets released at github.com/Qihoo360/LLMs-Safety-Control; the public TinyR1-S-8B safety variant is available for direct testing." 500 }, 501 "brand_recognition": { 502 "score": 0, 503 "justification": "Qiyuan Tech / Qihoo 360 is not a prominent AI lab internationally and has low name recognition in the AI safety research community." 504 } 505 }, 506 "hn_data": { 507 "threads": [ 508 { 509 "hn_id": "44963444", 510 "title": "ComputerRL: Scaling Reinforcement Learning for Computer Use Agents", 511 "points": 1, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=44963444", 514 "created_at": "2025-08-20T16:37:58Z" 515 }, 516 { 517 "hn_id": "44116793", 518 "title": "When Models Don't Collapse: On the Consistency of Iterative MLE", 519 "points": 1, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=44116793", 522 "created_at": "2025-05-28T15:06:51Z" 523 }, 524 { 525 "hn_id": "43291999", 526 "title": "Think Inside the JSON: Reinforcement Strategy for Strict LLM Schema Adherence", 527 "points": 1, 528 "comments": 0, 529 "url": "https://news.ycombinator.com/item?id=43291999", 530 "created_at": "2025-03-07T17:19:08Z" 531 }, 532 { 533 "hn_id": "43207715", 534 "title": "GneissWeb: Preparing High Quality Data for LLMs at Scale", 535 "points": 1, 536 "comments": 0, 537 "url": "https://news.ycombinator.com/item?id=43207715", 538 "created_at": 
"2025-02-28T16:50:52Z" 539 } 540 ], 541 "top_points": 1, 542 "total_points": 4, 543 "total_comments": 0 544 } 545 }