ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (27579B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
      6     "authors": [
      7       "DeepSeek-AI"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2501.12948",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All major abstract claims are supported: pure RL enabling reasoning is demonstrated by R1-Zero (Figure 1, Table 8), emergent self-reflection is shown in Figure 9 and Table 2, superior performance to SFT counterparts is confirmed in Table 8, and distillation enabling smaller models is shown in Table 15.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The R1-Zero experiment directly tests pure RL from a base model with no SFT; staged ablations (Dev1–Dev3 in Table 3) isolate component contributions; language consistency reward is ablated (Figure 7); distillation vs. pure RL is compared (Table 16). Multiple causal claims are supported by appropriate ablation designs.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper explicitly states results are strongest for 'verifiable tasks such as mathematics, coding competitions, and STEM fields'; Section 6 limitations acknowledge degraded performance for open-ended writing, software engineering, and non-Chinese/English languages.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper attributes all improvements to RL but does not discuss alternative explanations: whether gains stem from longer output generation, additional training compute, superior base model quality, or reward function design rather than the RL mechanism per se.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper claims 'reasoning capability' but measures benchmark accuracy (AIME, MATH-500, LiveCodeBench) without explicitly discussing the relationship between benchmark performance and the broader construct of reasoning ability.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6 'Conclusion, Limitation, and Future Work' contains a dedicated, multi-paragraph limitations section listing specific capability gaps beyond a single sentence.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Specific threats are identified: prompting sensitivity (few-shot consistently degrades performance), reward hacking documented with example (Figure 6), language mixing in non-Chinese/English queries, and limited RL for software engineering tasks due to long evaluation times.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper explicitly states the approach is limited to tasks with reliable verifiers and notes that 'for complex tasks that cannot be effectively evaluated by a reliable reward model, scaling up pure RL methods remains an open challenge.'",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source or acknowledgment section is present. DeepSeek-AI appears to be corporate self-funded but this is not explicitly stated.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All authors are from DeepSeek-AI, clearly stated as the sole author affiliation with contact email research@deepseek.com.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "DeepSeek-AI employees are evaluating their own model and comparing it favorably against competitors; the funder (DeepSeek) is not independent of the outcome.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement, patents, equity, or financial interests declaration appears anywhere in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key technical terms are formally defined: GRPO is specified with full equations (1–3), reward design (accuracy + format rewards) is explained with formula (4), cold start data is described with examples, and the multi-stage pipeline is diagrammed in Figure 2.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper clearly states its contribution: showing that LLM reasoning can be incentivized through pure RL without human-labeled demonstrations, producing models that match OpenAI-o1 on reasoning benchmarks.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section H provides substantive related work covering chain-of-thought (Wei et al. 2022), inference-time scaling, and RL for reasoning, explicitly contrasting their approach with PRM, MCTS, STaR, and RLHF methods.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "Inference code is released at https://github.com/deepseek-ai/DeepSeek-V3 with torchrun commands and model weights on HuggingFace at https://huggingface.co/deepseek-ai under MIT license.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "All evaluation benchmarks (AIME, MATH-500, MMLU, LiveCodeBench, etc.) are standard public benchmarks used unmodified; training data release is promised but URL is placeholder 'xxx' in the paper.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The inference code example includes 'pip install -r requirements.txt', referencing a requirements file in the GitHub repository; training infrastructure specifies H800 GPUs, vLLM, and DualPipe algorithm.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Inference instructions are provided (torchrun commands), but step-by-step training reproduction is not feasible: training data URL is placeholder 'xxx', and the full RL pipeline requires 147K GPU hours of H800 compute not documented in reproducible form.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "All main result tables report point estimates only; no confidence intervals or error bars are shown despite results being averaged over k=4–64 samples per question.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Table captions state 'Numbers in bold denote the performance is statistically significant (t-test with p < 0.01)', applied to comparative performance claims.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Raw performance numbers are reported throughout enabling direct effect size interpretation (e.g., AIME 79.8% vs 79.2% for o1, MATH-500 97.3% vs 96.4%).",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The paper states k values (k=64 for AIME, k=16 for MATH, k=8 for LCB) but provides no power analysis or formal justification for these specific choices.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No standard deviations or variance across evaluation runs are reported in any of the main results tables, despite averaging over multiple samples.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Comprehensive baselines in Table 8: Claude-3.5-Sonnet-1022, GPT-4o-0513, DeepSeek-V3, OpenAI-o1-mini, OpenAI-o1-1217.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines are frontier models as of early 2025: OpenAI-o1-1217 (December 2024), Claude-3.5-Sonnet-1022, and GPT-4o-0513 — the best available comparators.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Multiple ablations: stage-by-stage comparison (R1-Zero, Dev1–Dev3, R1 in Table 3), language consistency reward ablation (Figure 7), and distillation vs. large-scale RL comparison (Table 16).",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Over 20 benchmarks used spanning math (AIME 2024, MATH-500, CNMO), code (LiveCodeBench, Codeforces, SWE-Bench, Aider), knowledge (MMLU, GPQA Diamond), and instruction following (IFEval, AlpacaEval 2.0, ArenaHard).",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "ChatbotArena crowdsourced pairwise human preference evaluation is used (Figures 11–12), showing DeepSeek-R1 ranking first alongside OpenAI-o1 on the style control setting.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "All evaluation benchmarks are held-out test sets; additionally AIME 2025 (released after training cutoff) is used to assess generalization to genuinely unseen problems.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Per-category breakdowns provided: MMLU by subject (Figures 15–16), math by competition category (Figure 17), LiveCodeBench by difficulty (Table 14), and safety by category (Tables 9–11).",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Failure cases are shown: reward hacking during training (Figure 6), language mixing in multilingual queries, overthinking on simple problems, and Section G.2 explicitly reports failed PRM and MCTS approaches.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section G.2 'Unsuccessful Attempts' dedicates a full section to reporting failures with Process Reward Models (annotation difficulty, reward hacking) and Monte Carlo Tree Search (exponential search space, value model difficulties).",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Base model is DeepSeek-V3-Base; all baseline models include version dates (Claude-3.5-Sonnet-1022, GPT-4o-0513, OpenAI-o1-1217); intermediate checkpoints (Dev1–Dev3) are labeled.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Multiple prompts provided in full: R1-Zero training template (Table 1), reward model prompt (Listing 8), SFT trajectory examples (Listings 5–7), test case generation prompts (Listing 2), and benchmark evaluation prompts (Tables 18–32).",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Full hyperparameters reported in Appendix B.4: learning rate (3e-6), KL coefficient (0.001), clip ratio (ε=10), sampling temperature (1.0), batch size (512), max sequence lengths (32,768–65,536 tokens), per-model distillation learning rates in Table 6.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "RL infrastructure described in detail (Figure 5, Appendix B.1): four distinct modules (rollout via vLLM, inference, rule-based reward, training), expert parallelism strategy, VRAM management, data packing strategy.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Data preprocessing documented: 10-gram decontamination (removing ~6M math texts), cold start data generation pipeline with rejection sampling and human refinement, SFT filtering (language mixing, length, repetition detection), evaluation prompt formats.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "All evaluation benchmarks (AIME, MATH-500, LiveCodeBench, MMLU, etc.) are publicly accessible; model weights are available on HuggingFace enabling independent evaluation replication.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Data collection documented in Appendix B.3 and Table 4: 26K math, 17K code, 22K STEM, 15K logic, 66K general prompts with sources, formats, average lengths, and construction procedures.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants were recruited; standard public benchmarks were used for evaluation.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Full data pipeline documented from collection through SFT: cold start generation (Listings 1–3), rejection sampling, human annotation and verification steps, 800K SFT data statistics (Table 5), and decontamination procedures.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "Appendix D.1 explicitly states 'DeepSeek-V3 base has a knowledge cutoff date of July 2024.'",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Appendix D.1 'Decontamination' explicitly discusses overlap: 10-gram filtering removed ~6M math-related texts; post-training data sourced exclusively from pre-2023 competitions; paper acknowledges n-gram filtering cannot prevent paraphrase contamination.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "AIME 2025 (post-July 2024 cutoff) is used to test generalization to genuinely unseen problems (Table 13), showing 75% solve rate approaching o1's 80%.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human subjects research in this study.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants requiring IRB approval.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "Per-query inference cost and latency are not reported; only training costs appear in Table 7.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 7 provides detailed training costs: R1-Zero 101K H800 GPU hours ($202K), SFT data creation 5K hours ($10K), DeepSeek-R1 41K hours ($82K), total 147K GPU hours ($294K at $2/GPU-hour).",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "DeepSeek-R1-Zero reaches 77.9% pass@1 on AIME 2024 via pure RL without any supervised fine-tuning",
    371       "evidence": "Figure 1 shows R1-Zero's training progression from 15.6% to 77.9% pass@1 on AIME 2024, and 86.7% with cons@64 self-consistency; the 79.8% pass@1 reported in Table 8 belongs to the final DeepSeek-R1 model, not R1-Zero",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "DeepSeek-R1 performance matches OpenAI-o1-1217 on mathematical reasoning benchmarks",
    376       "evidence": "Table 8: DeepSeek-R1 79.8% vs o1 79.2% on AIME 2024, 97.3% vs 96.4% on MATH-500, 78.8% vs unreported on CNMO 2024",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Advanced reasoning behaviors (self-reflection, verification, 'aha moments') emerge spontaneously from RL training without explicit instruction",
    381       "evidence": "Figure 9 shows 5–7x increase in reflective word frequency during training; Table 2 shows the model spontaneously generating 'Wait, wait. Wait. That's an aha moment' to self-correct",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Distilled small models (1.5B–70B) substantially outperform non-reasoning models of comparable or larger size",
    386       "evidence": "Table 15: DeepSeek-R1-Distill-Qwen-1.5B achieves 28.9% AIME 2024 pass@1, surpassing GPT-4o-0513 (9.3%) and Claude-3.5-Sonnet (16.0%)",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Larger base model capacity is a prerequisite for RL-induced reasoning improvements to emerge",
    391       "evidence": "Section G.1 reports that 7B dense and 16B MoE models showed no meaningful AIME improvements under RL, while 32B+ models showed substantial gains",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Distillation from a strong reasoning model outperforms training smaller models with large-scale RL directly",
    396       "evidence": "Table 16: DeepSeek-R1-Distill-Qwen-32B (72.6% AIME) substantially outperforms Qwen2.5-32B-Zero trained with 10K RL steps (47.0% AIME)",
    397       "supported": "strong"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval",
    402     "empirical"
    403   ],
    404   "key_findings": "DeepSeek-R1-Zero demonstrates that pure reinforcement learning applied to a capable base model can autonomously develop sophisticated reasoning behaviors—self-reflection, verification, dynamic strategy adaptation—without any human-annotated demonstrations, reaching 77.9% pass@1 on AIME 2024. The multi-stage DeepSeek-R1 pipeline (cold start + RL + SFT + RL) addresses readability and language consistency issues while maintaining frontier reasoning performance, reaching 79.8% on AIME 2024 and matching OpenAI-o1. Knowledge distillation from R1 into small models (1.5B–70B) produces models that dramatically outperform non-reasoning models of similar size. Two key negative findings: process reward models and MCTS were attempted and abandoned due to reward hacking and scaling difficulties; and smaller base models (7B, 16B MoE) failed to benefit from RL, establishing model scale as a prerequisite.",
    405   "red_flags": [
    406     {
    407       "flag": "Training data URL placeholder",
    408       "detail": "The paper states SFT and RL training data is released 'at xxx' — a literal placeholder, meaning training data was not actually accessible at publication time, preventing training reproduction."
    409     },
    410     {
    411       "flag": "Self-evaluation with no independent replication",
    412       "detail": "DeepSeek-AI employees evaluate their own model; results for OpenAI-o1-1217 are taken from official reports rather than independently measured, making direct comparisons unverifiable."
    413     },
    414     {
    415       "flag": "Severe jailbreak vulnerability",
    416       "detail": "Table 11 shows DeepSeek-R1 without risk control reaches 85.9% unsafe rate under jailbreak attacks — the highest of all tested models. The paper acknowledges enhanced reasoning makes dangerous content more operationally feasible."
    417     },
    418     {
    419       "flag": "No variance reporting",
    420       "detail": "All main benchmark tables report only point estimates; no standard deviations or confidence intervals are shown despite results being averaged over k=4–64 samples per question."
    421     },
    422     {
    423       "flag": "Alternative explanations unaddressed",
    424       "detail": "Improvements attributed solely to RL without considering confounders: longer output generation, additional training compute (147K GPU-hours), or base model quality (DeepSeek-V3-Base already strong) could partially explain gains."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    430       "relevance": "Foundational CoT work that R1 extends via RL; the primary paradigm R1 challenges by showing RL can discover reasoning without human-curated demonstrations"
    431     },
    432     {
    433       "title": "Training language models to follow instructions with human feedback (InstructGPT)",
    434       "relevance": "Establishes the SFT+RLHF paradigm that R1 partially circumvents; key baseline for comparing post-training approaches"
    435     },
    436     {
    437       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    438       "relevance": "Introduces GRPO algorithm used as R1's RL backbone; directly cited as the training algorithm"
    439     },
    440     {
    441       "title": "Let's verify step by step",
    442       "relevance": "Process reward model work that DeepSeek-R1 attempted and abandoned (Section G.2), providing important negative results context for the field"
    443     },
    444     {
    445       "title": "Self-consistency improves chain of thought reasoning in language models",
    446       "relevance": "Self-consistency decoding (cons@16, cons@64) is used in evaluating R1-Zero and boosts AIME accuracy from 77.9% to 86.7%"
    447     },
    448     {
    449       "title": "STaR: Bootstrapping reasoning with reasoning",
    450       "relevance": "Prior RL-based reasoning enhancement that R1 builds upon; key comparison point for showing R1's approach differs by starting from pure RL on base models"
    451     },
    452     {
    453       "title": "DeepSeek-V3 technical report",
    454       "relevance": "DeepSeek-V3-Base is the base model for all R1 variants; understanding the base model is essential for interpreting what RL adds"
    455     },
    456     {
    457       "title": "Scaling LLM test-time compute optimally can be more effective than scaling parameters for reasoning",
    458       "relevance": "Related work on test-time compute scaling; R1's adaptive CoT length is analyzed in relation to this paradigm in Section E.4"
    459     },
    460     {
    461       "title": "Proximal policy optimization algorithms",
    462       "relevance": "PPO is the primary RL baseline compared against GRPO (Figure 4, Appendix A.3); establishing why GRPO is preferred for large-scale training"
    463     },
    464     {
    465       "title": "Language models are few-shot learners (GPT-3)",
    466       "relevance": "Establishes emergent capabilities framework used to contextualize R1's emergent reasoning behaviors"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Model weights freely available on HuggingFace under MIT license; practitioners can immediately use distilled 1.5B–70B models for math, code, and reasoning tasks."
    473     },
    474     "surprise_contrarian": {
    475       "score": 3,
    476       "justification": "Demonstrating that pure RL without SFT produces frontier reasoning challenged the field's consensus that extensive human demonstrations were essential for capable post-training."
    477     },
    478     "fear_safety": {
    479       "score": 2,
    480       "justification": "The paper documents 85.9% unsafe rate under jailbreak attacks without risk control and explicitly notes enhanced reasoning makes dangerous content more operationally feasible."
    481     },
    482     "drama_conflict": {
    483       "score": 3,
    484       "justification": "Directly matches OpenAI-o1 on math benchmarks at $294K training cost under MIT license, challenging the assumption that frontier reasoning models require closed proprietary development."
    485     },
    486     "demo_ability": {
    487       "score": 3,
    488       "justification": "Model weights downloadable from HuggingFace immediately; distilled versions (1.5B–70B) accessible on consumer hardware; official API available."
    489     },
    490     "brand_recognition": {
    491       "score": 3,
    492       "justification": "DeepSeek-R1 became one of the most discussed AI papers of early 2025, generating 1,351 HN points and triggering significant market reactions upon release."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "42823568",
    499         "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via RL",
    500         "points": 1351,
    501         "comments": 1056,
    502         "url": "https://news.ycombinator.com/item?id=42823568",
    503         "created_at": "2025-01-25T18:39:49Z"
    504       },
    505       {
    506         "hn_id": "42915646",
    507         "title": "Stack Overflow Meets Replication: Security Research Amid Evolving Code Snippets",
    508         "points": 1,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=42915646",
    511         "created_at": "2025-02-03T06:49:46Z"
    512       }
    513     ],
    514     "top_points": 1351,
    515     "total_points": 1352,
    516     "total_comments": 1056
    517   }
    518 }

Impressum · Datenschutz