scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35313B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
      6     "authors": [
      7       "DeepSeek-AI"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2501.12948",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract claims: (1) reasoning via pure RL — supported by R1-Zero results (Section 2, Table 3); (2) emergent self-reflection/verification — supported by Figure 9 and Table 2; (3) superior to supervised learning — supported by Table 12 R1 vs V3 comparison; (4) distillation to smaller models — supported by Table 15.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Causal claims about RL driving reasoning emergence are supported by the ablation structure: R1-Zero (pure RL, no SFT) develops reasoning behaviors tracked across training steps (Figures 1, 8, 9). The multi-stage pipeline (Table 3) isolates each phase's contribution. Ablation in B.6 tests language consistency reward causally.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper makes broad claims about 'incentivizing reasoning capability in LLMs' but results are specific to DeepSeek-V3-Base architecture. Section G.1 notes smaller models (7B, 16B) failed to show improvements, suggesting the approach may not generalize to all LLMs. The title and abstract are broader than the tested setting.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section A.1 acknowledges that pre-training data may contain OpenAI-model-generated answers on web pages. Section G.1 discusses base model capacity as a critical factor. The paper acknowledges reward hacking as an alternative to genuine improvement (Section B.5). Section 6 discusses prompt sensitivity affecting results.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper frames benchmark scores (AIME, MATH, Codeforces) as evidence of 'reasoning capability' without discussing the gap between benchmark performance and actual reasoning. No discussion of whether solving math competitions is a valid proxy for general reasoning ability, despite the title claiming general 'reasoning capability.'",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6 'Conclusion, Limitation, and Future Work' contains extensive discussion of specific limitations across multiple paragraphs covering structural output, token efficiency, language mixing, prompt sensitivity, software engineering, reward hacking.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 6 identifies specific threats: (1) pure RL depends on reliable reward signals which are hard for tasks like writing; (2) language mixing is caused by the base model's Chinese/English training data; (3) few-shot prompting degrades R1 performance; (4) limited software engineering RL data. Section G.2 details specific failed approaches.",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 6 explicitly states what R1 cannot do: structural output, tool use, token efficiency for simple problems, non-Chinese/English languages. Section G.1 states smaller models failed, bounding the approach to large-scale models. Acknowledges reward hacking as unsolved for complex tasks.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding disclosure or acknowledgments section mentioning funding sources. DeepSeek is a commercial AI company but no explicit funding statement is provided.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All authors are listed under 'DeepSeek-AI' with the email research@deepseek.com. The affiliation is clear — this is a company paper evaluating its own product.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "DeepSeek-AI is a commercial AI company evaluating its own model. The funder (DeepSeek) has a direct financial interest in positive results. No independent funding or external validation mentioned.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests or financial interests statement. DeepSeek is a for-profit company whose commercial product is the subject of the paper. No disclosure of patents, equity, or other financial interests.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "GRPO is formally defined with equations (2.1), reward components (accuracy, format, language consistency) are precisely defined, evaluation protocols are specified per-benchmark in Appendix D, and 'verifiable tasks' is operationalized through specific benchmark categories.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper clearly states three contributions: DeepSeek-R1-Zero (pure RL without SFT), DeepSeek-R1 (multi-stage training pipeline), and distilled smaller models (1.5B–70B); the claim of enabling reasoning without human-labeled trajectories is explicit in the abstract.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section H provides dedicated Related Work on Chain-of-thought Reasoning, Scaling Inference-time Compute, and RL for Reasoning Enhancement, explicitly situating the work relative to OpenAI-o1, STaR, PRM approaches, MCTS, and self-consistency methods.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "Inference code released on GitHub (https://github.com/deepseek-ai/DeepSeek-V3 and https://github.com/deepseek-ai/DeepSeek-R1). Model weights released on HuggingFace. Section I provides download and usage instructions.",
    121           "source": "opus"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Section I states 'We also release SFT and RL data to the public.' The paper uses many standard public benchmarks (MMLU, AIME, Codeforces, etc.) and releases model weights on HuggingFace.",
    127           "source": "opus"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Section I mentions 'pip install -r requirements.txt' but does not provide the actual environment specification or library versions in the paper itself. No Dockerfile or detailed environment setup section.",
    133           "source": "opus"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "Section I provides step-by-step commands for downloading weights, cloning the repo, installing dependencies, converting model weights, and running inference. Hardware requirements (16 H800 GPUs) are specified.",
    139           "source": "opus"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Tables report point estimates only (e.g., '79.8% AIME 2024'). No confidence intervals or error bars are provided despite multiple samples being generated. Bold numbers indicate t-test significance but no CIs.",
    147           "source": "opus"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Tables 3, 8, and 15 state 'Numbers in bold denote the performance is statistically significant (t-test with p < 0.01).' Statistical significance is used for comparative claims.",
    153           "source": "opus"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Results consistently report absolute scores with baselines for context (e.g., AIME from 15.6% initial to 77.9% after RL, Codeforces 96.3 percentile vs 58.7 for V3). Percentage improvements and rating differences are provided throughout Tables 3, 8, 12.",
    159           "source": "opus"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No justification for evaluation sample sizes. Pass@k uses k=64 for AIME, k=16 for MATH, k=8 for LCB (Section D.1), but no power analysis or justification for why these k values are sufficient.",
    165           "source": "opus"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Results are reported as pass@1 averages over k samples but no standard deviations, IQR, or spread measures are provided across runs or seeds. Only point estimates in result tables.",
    171           "source": "opus"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Table 8 compares against Claude-3.5-Sonnet, GPT-4o, DeepSeek-V3, OpenAI-o1-mini, and OpenAI-o1-1217. Table 15 compares distilled models against GPT-4o and Claude-3.5-Sonnet.",
    179           "source": "opus"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines include OpenAI-o1-1217, GPT-4o-0513, Claude-3.5-Sonnet-1022, and QwQ-32B-Preview — all contemporary models at time of publication (January 2025).",
    185           "source": "opus"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Table 3 shows results at each pipeline stage (R1-Zero → Dev1 → Dev2 → Dev3 → R1), isolating the contribution of each training phase. Appendix B.6 provides ablation on language consistency reward. Section F.1 compares distillation vs RL.",
    191           "source": "opus"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Extensive multi-metric evaluation: MMLU, MMLU-Pro, GPQA, AIME, MATH-500, LiveCodeBench, Codeforces rating/percentile, SWE-Bench, Aider, AlpacaEval, ArenaHard, IFEval, SimpleQA, FRAMES, and more (Table 8).",
    197           "source": "opus"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Section D.2 reports ChatbotArena human evaluation results with Elo rankings. Figure 11 shows style-controlled ranking. The platform uses double-blind pairwise comparisons with millions of user votes.",
    203           "source": "opus"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Multiple held-out test sets used. Section E.2 specifically tests on AIME 2025 (released after training) to verify generalization. LiveCodeBench uses problems from Aug 2024-Jan 2025. Decontamination applied (Section D.1).",
    209           "source": "opus"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Figures 15-17 provide per-category breakdowns of MMLU (by subject), MMLU-Pro (by domain), and math competition problems (by category: algebra, geometry, combinatorics, etc.). Table 14 breaks LiveCodeBench by difficulty.",
    215           "source": "opus"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Section 6 discusses specific failure modes: poor structural output, token inefficiency/overthinking, language mixing, prompt sensitivity, limited software engineering improvement. Section G.2 discusses unsuccessful attempts (PRM, MCTS).",
    221           "source": "opus"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section G.2 reports three failed approaches: Process Reward Models, MCTS, and small-model RL. Figure 6 shows reward hacking (reward increases while Codeforces performance decreases). Appendix B.6 shows language consistency reward slightly degrades code performance.",
    227           "source": "opus"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Model versions specified: DeepSeek-V3-Base (671B/37B active MoE), GPT-4o-0513, Claude-3.5-Sonnet-1022, OpenAI-o1-1217, and GPT-4-Turbo-1106 as the judge model for evaluation. Baselines include snapshot dates. Base models for the distilled models are specified in Table 6.",
    235           "source": "opus"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Extensive prompt text provided: training template (Table 1), evaluation prompts for each benchmark (Tables 18-32), reward model prompt (Appendix B.2), risk review prompt (Listing 8), SFT data generation prompts (Listings 1-4), and cold-start prompts (Listings 5-7).",
    241           "source": "opus"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Detailed hyperparameters throughout: learning rate 3e-6, KL coefficient 0.001, temperature 1/0.7, batch size 512, GRPO clip ratio ε=10, max length 32768/65536, reference model update every 400 steps (Sections 2.1, 3.2). Distillation hyperparameters in Table 6. SFT hyperparameters in B.4.2.",
    247           "source": "opus"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used. DeepSeek-R1 is a single model that generates responses directly. The RL training pipeline is infrastructure, not scaffolding.",
    253           "source": "opus"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section B.3 describes RL data composition (Table 4: 26K math, 17K code, 22K STEM, 15K logic, 66K general). Cold-start data creation (B.3.2) describes filtering pipeline. SFT data statistics in Table 5. Decontamination filtering described in D.1. Code test case generation pipeline in B.3.1.",
    259           "source": "opus"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Training data details are described, but the actual training data is not fully released. Section I mentions that SFT and RL data will be released, but at the time of this version of the paper the release appears incomplete ('released at xxx' placeholder). Pre-training data is proprietary web crawl data.",
    267           "source": "opus"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section B.3 describes RL data collection across all categories with counts, sources, and characteristics. Cold-start data pipeline (B.3.2) describes DeepSeek-R1-Zero generation → filtering → DeepSeek-V3 refinement → human verification. Code test case generation in B.3.1.",
    273           "source": "opus"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants in the study. ChatbotArena is an external platform. The paper uses standard benchmarks and model-generated data.",
    279           "source": "opus"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Figure 2 illustrates the full multi-stage pipeline. Section B.3.2 documents cold-start data: prompts → R1-Zero generation (temp=1.0) → correctness filtering (sympy) → format filtering (repetition, language mixing) → V3 refinement → human verification. Table 5 provides final data statistics.",
    285           "source": "opus"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "Section D.1 states 'DeepSeek-V3 base has a knowledge cutoff date of July 2024, predating evaluation benchmarks like CNMO 2024.'",
    293           "source": "opus"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Section D.1 describes comprehensive decontamination: 'filtered out any text segments (including web pages and GitHub files) that contained matching 10-gram sequences from evaluation questions or reference solutions.' Math domain alone identified ~6 million potential texts for removal.",
    299           "source": "opus"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "Section D.1 addresses contamination: n-gram filtering for pre-training and post-training data, temporal separation (math SFT from pre-2023 competitions only), and honest acknowledgment that 'n-gram based decontamination method cannot prevent the paraphrase of testset.' Section E.2 tests on AIME 2025 as fresh validation.",
    305           "source": "opus"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants study. ChatbotArena is an external crowdsourced platform, not a study run by the authors.",
    313           "source": "opus"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants study conducted by the authors.",
    319           "source": "opus"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants study conducted by the authors.",
    325           "source": "opus"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants study conducted by the authors.",
    331           "source": "opus"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants study conducted by the authors.",
    337           "source": "opus"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants study conducted by the authors.",
    343           "source": "opus"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants study conducted by the authors.",
    349           "source": "opus"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference cost or latency is reported, despite the model generating thousands of thinking tokens per problem. Figure 18 shows token counts, but these are not translated into wall-clock time or dollar cost.",
    357           "source": "opus"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 7 reports total training costs: 101K H800 GPU hours for R1-Zero, 5K for SFT data creation, 41K for R1, totaling 147K GPU hours ($294K at $2/GPU-hour). Hardware specified as 64×8 H800 GPUs, with R1-Zero training taking ~198 hours.",
    363           "source": "opus"
    364         }
    365       },
    366       "experimental_rigor": {
    367         "seed_sensitivity_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No results reported across multiple random seeds. Training curves (Figure 1) show single runs. Evaluation uses pass@k sampling but does not report sensitivity to random seeds in training.",
    371           "source": "opus"
    372         },
    373         "number_of_runs_stated": {
    374           "applies": true,
    375           "answer": true,
    376           "justification": "Section D.1 specifies: k=64 for AIME and GPQA, k=16 for MATH and CodeForces, k=8 for LCB. Pass@1 formula is provided. Reward model trained for a single epoch (B.2). Training is 10,400 steps / 1.6 epochs for R1-Zero.",
    377           "source": "opus"
    378         },
    379         "hyperparameter_search_budget": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "No hyperparameter search budget reported. Section B.4 lists final hyperparameters but does not describe how they were selected or how many configurations were tried. The paper mentions smaller model experiments (7B, 16B, 32B, 230B) but not systematic search.",
    383           "source": "opus"
    384         },
    385         "best_config_selection_justified": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "The paper presents final configurations without explaining how they were selected. For example, GRPO clip ratio ε=10 is described as crucial but the selection process is not documented.",
    389           "source": "opus"
    390         },
    391         "multiple_comparison_correction": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "The paper makes many pairwise comparisons across ~20 benchmarks and 6+ models with t-tests (p<0.01) but no multiple comparison correction (Bonferroni, Holm, etc.) is mentioned.",
    395           "source": "opus"
    396         },
    397         "self_comparison_bias_addressed": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "DeepSeek employees evaluate DeepSeek models against competitors. No acknowledgment of self-comparison bias. Baselines from other companies are evaluated using the authors' framework. No independent evaluation mentioned beyond ChatbotArena.",
    401           "source": "opus"
    402         },
    403         "compute_budget_vs_performance": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "No comparison at matched compute budgets. DeepSeek-R1 uses long chains of thought (8K-18K tokens per problem, Figure 18) vs baselines that use much less compute. The 671B model is compared against models of various sizes without compute normalization.",
    407           "source": "opus"
    408         },
    409         "benchmark_construct_validity": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "No discussion of whether benchmarks like AIME, MATH, or Codeforces actually measure 'reasoning capability' as claimed. The paper equates benchmark performance with reasoning without questioning construct validity.",
    413           "source": "opus"
    414         },
    415         "scaffold_confound_addressed": {
    416           "applies": false,
    417           "answer": false,
    418           "justification": "No agentic scaffolding is involved in the main results; the model generates responses directly. The one exception is SWE-Bench, which uses the agentless framework consistently (Section D.1).",
    419           "source": "opus"
    420         }
    421       },
    422       "data_leakage": {
    423         "temporal_leakage_addressed": {
    424           "applies": true,
    425           "answer": true,
    426           "justification": "Section D.1 states training cutoff is July 2024 and notes benchmarks like CNMO 2024 postdate it. Math SFT prompts sourced exclusively from pre-2023 competitions. Section E.2 tests on AIME 2025 as temporal validation.",
    427           "source": "opus"
    428         },
    429         "feature_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of whether evaluation prompts or framework setups leak information. For example, AlpacaEval and ArenaHard use GPT-4 as judge, and the evaluation prompt format could advantage certain response styles.",
    433           "source": "opus"
    434         },
    435         "non_independence_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether benchmark problems share structural similarities with training data. Acknowledged that web pages may contain 'OpenAI-model-generated answers' (Section A.1) but not analyzed for benchmark-specific overlap beyond n-gram filtering.",
    439           "source": "opus"
    440         },
    441         "leakage_detection_method": {
    442           "applies": true,
    443           "answer": true,
    444           "justification": "Section D.1 describes concrete decontamination: 10-gram sequence matching against evaluation questions and reference solutions, removing ~6 million pre-training texts in math alone. Post-training data undergoes same n-gram protocol. Temporal split (pre-2023 math prompts only).",
    445           "source": "opus"
    446         }
    447       }
    448     }
    449   },
    450   "claims": [
    451     {
    452       "claim": "LLMs can develop advanced reasoning capabilities (self-reflection, verification, long CoT) through pure RL without any supervised fine-tuning on human reasoning demonstrations",
    453       "evidence": "DeepSeek-R1-Zero achieves 77.9% pass@1 on AIME 2024 starting from 15.6%; reflective word frequency increases 5–7× during training (Figure 9); the 'aha moment' demonstrates emergent self-correction behavior (Table 2)",
    454       "supported": "strong"
    455     },
    456     {
    457       "claim": "DeepSeek-R1 achieves performance on par with OpenAI-o1-1217 on mathematical and coding benchmarks",
    458       "evidence": "Table 8: AIME 2024 79.8% (R1) vs 79.2% (o1-1217), MATH-500 97.3% vs 96.4%, Codeforces percentile 96.3% vs 96.6%; cons@64 86.7% on AIME",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Distilling DeepSeek-R1 into a 1.5B parameter model produces a model that outperforms GPT-4o on mathematical benchmarks",
    463       "evidence": "Table 15: DeepSeek-R1-Distill-Qwen-1.5B achieves 28.9% AIME vs GPT-4o 9.3%, and 83.9% MATH-500 vs GPT-4o 74.6%",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Distillation from a strong teacher model substantially outperforms direct RL training on smaller models",
    468       "evidence": "Table 16: DeepSeek-R1-Distill-Qwen-32B (72.6% AIME) vs Qwen2.5-32B-Zero trained with 10K+ steps of large-scale RL (47.0% AIME)",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "Test-time compute scaling with reasoning models fundamentally outperforms majority voting with non-reasoning models even when controlling for total tokens",
    473       "evidence": "Section E.4: Majority voting across 64 samples raises GPT-4o AIME from 9.3% to 13.4%; DeepSeek-R1 pass@1 is 79.8% using ~8,793 thinking tokens vs GPT-4o's 711 average output tokens",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "The SFT cold-start phase causes degradation in reasoning capability relative to pure RL, requiring subsequent RL to recover",
    478       "evidence": "Table 3: Dev1 (after cold-start SFT) drops AIME from 77.9% (R1-Zero) to 59.0%, with reasoning performance only recovering in Dev2 after additional RL",
    479       "supported": "moderate"
    480     }
    481   ],
    482   "methodology_tags": [
    483     "benchmark-eval",
    484     "empirical"
    485   ],
    486   "key_findings": "DeepSeek-R1 demonstrates that advanced reasoning capabilities—including self-reflection, verification, and extended chain-of-thought—can emerge in LLMs through pure reinforcement learning without supervised fine-tuning on human demonstrations, with DeepSeek-R1-Zero achieving 77.9% on AIME 2024 starting from a 15.6% baseline. The final DeepSeek-R1 matches OpenAI-o1-1217 performance across math and coding benchmarks (79.8% AIME, 97.3% MATH-500, 96.3% Codeforces percentile) while being fully open-source. Distillation experiments demonstrate that a 1.5B parameter model distilled from DeepSeek-R1 outperforms GPT-4o on math benchmarks, and that distillation from a strong teacher substantially outperforms direct RL training on smaller models. The paper also provides valuable negative results documenting why Process Reward Models and Monte Carlo Tree Search failed to scale, and empirically shows that cold-start SFT temporarily degrades the reasoning capability that pure RL develops.",
    487   "red_flags": [
    488     {
    489       "flag": "Training data URL placeholder",
    490       "detail": "The paper claims to release SFT and RL training data 'at xxx' — a clear placeholder URL in this version, meaning the 800K SFT and 146K RL datasets cannot be verified as released, undermining reproducibility claims."
    491     },
    492     {
    493       "flag": "AIME 2024 as both validation signal and test benchmark",
    494       "detail": "Figure 1 shows AIME 2024 accuracy tracked throughout training as the primary validation metric, yet Table 8 reports AIME 2024 as a final evaluation benchmark; using the same metric for training monitoring and final evaluation introduces information leakage."
    495     },
    496     {
    497       "flag": "Self-evaluation without independent replication",
    498       "detail": "All benchmark evaluations are conducted by DeepSeek-AI on their own model; no independent third-party replication is described; results for OpenAI-o1-1217 are taken from official reports rather than independent evaluation due to API access limitations in mainland China."
    499     },
    500     {
    501       "flag": "No variance reported for main results",
    502       "detail": "Tables 8, 12, and 15 report only point estimates without standard deviations; the t-test significance bolding requires variance estimates but these are not shown, making it impossible to independently verify statistical claims."
    503     },
    504     {
    505       "flag": "Circular reward model training",
    506       "detail": "The helpful reward model is trained on preference pairs generated by DeepSeek-V3, and DeepSeek-R1 uses DeepSeek-V3-Base as foundation; the preference signal may be biased toward DeepSeek-style outputs."
    507     },
    508     {
    509       "flag": "No statement of financial or competing interests",
    510       "detail": "A commercial AI company (DeepSeek/High-Flyer) releasing a model with MIT license has obvious commercial interests in the work being perceived as superior to competitors; no conflict of interest statement is provided."
    511     }
    512   ],
    513   "cited_papers": [
    514     {
    515       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    516       "relevance": "Source of the GRPO algorithm, the core RL method used in DeepSeek-R1"
    517     },
    518     {
    519       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    520       "relevance": "Foundational CoT work; DeepSeek-R1 shows these patterns can be learned through RL rather than requiring prompted demonstrations"
    521     },
    522     {
    523       "title": "Training language models to follow instructions with human feedback (InstructGPT/RLHF)",
    524       "relevance": "Established the SFT+RL paradigm that DeepSeek-R1 modifies by removing the SFT requirement for reasoning"
    525     },
    526     {
    527       "title": "Proximal Policy Optimization Algorithms",
    528       "relevance": "RL algorithm that GRPO simplifies; direct comparison in Figure 4 shows GRPO's advantages"
    529     },
    530     {
    531       "title": "STaR: Bootstrapping Reasoning with Reasoning",
    532       "relevance": "Related RL-based approach to learning reasoning through self-generated chains; DeepSeek-R1 extends this with pure outcome-based RL"
    533     },
    534     {
    535       "title": "Let's Verify Step by Step (Process Reward Models)",
    536       "relevance": "PRM approach documented as an unsuccessful attempt in Section G.2 with specific failure reasons"
    537     },
    538     {
    539       "title": "Scaling LLM Test-Time Compute Optimally",
    540       "relevance": "Inference-time scaling paper; DeepSeek-R1 demonstrates training-time RL can achieve what test-time methods cannot"
    541     },
    542     {
    543       "title": "Self-consistency improves chain of thought reasoning in language models",
    544       "relevance": "Majority voting baseline compared in test-time scaling analysis; shown to be substantially weaker than reasoning model approach"
    545     },
    546     {
    547       "title": "DeepSeek-V3 Technical Report",
    548       "relevance": "Base model on which DeepSeek-R1 is trained; architecture, pre-training data, and training details are directly relevant"
    549     },
    550     {
    551       "title": "Scaling Laws for Neural Language Models",
    552       "relevance": "Provides context for Section G.1 finding that RL only works with sufficiently large base models"
    553     }
    554   ],
    555   "engagement_factors": {
    556     "practical_relevance": {
    557       "score": 3,
    558       "justification": "Open-weight models (1.5B-70B) released on HuggingFace that practitioners can immediately download and use for reasoning tasks."
    559     },
    560     "surprise_contrarian": {
    561       "score": 2,
    562       "justification": "Pure RL without SFT producing emergent reasoning behaviors and matching OpenAI-o1 challenges the assumption that supervised fine-tuning on human demonstrations is necessary."
    563     },
    564     "fear_safety": {
    565       "score": 1,
    566       "justification": "Paper acknowledges jailbreak vulnerabilities and enhanced capability for dangerous content but treats safety as secondary to the technical contribution."
    567     },
    568     "drama_conflict": {
    569       "score": 3,
    570       "justification": "A Chinese lab openly challenges OpenAI's flagship reasoning model, claims comparable performance at a fraction of the cost ($294K), and releases everything under MIT license."
    571     },
    572     "demo_ability": {
    573       "score": 3,
    574       "justification": "All model weights from 1.5B to 671B are publicly available on HuggingFace with inference code and instructions, and a hosted API exists."
    575     },
    576     "brand_recognition": {
    577       "score": 3,
    578       "justification": "DeepSeek-R1 became a global news story, directly competing with OpenAI's o1, and is one of the most discussed AI releases of 2025."
    579     }
    580   },
    581   "hn_data": {
    582     "threads": [
    583       {
    584         "hn_id": "42823568",
    585         "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via RL",
    586         "points": 1351,
    587         "comments": 1056,
    588         "url": "https://news.ycombinator.com/item?id=42823568",
    589         "created_at": "2025-01-25T18:39:49Z"
    590       },
    591       {
    592         "hn_id": "42915646",
    593         "title": "Stack Overflow Meets Replication: Security Research Amid Evolving Code Snippets",
    594         "points": 1,
    595         "comments": 0,
    596         "url": "https://news.ycombinator.com/item?id=42915646",
    597         "created_at": "2025-02-03T06:49:46Z"
    598       }
    599     ],
    600     "top_points": 1351,
    601     "total_points": 1352,
    602     "total_comments": 1056
    603   }
    604 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs