scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31505B)
      1 {
      2   "paper": {
      3     "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
      4     "authors": [
      5       "DeepSeek-AI"
      6     ],
      7     "year": 2025,
      8     "venue": "arXiv",
      9     "arxiv_id": "2501.12948"
     10   },
     11   "scan_version": 3,
     12   "active_modules": [
     13     "experimental_rigor",
     14     "data_leakage"
     15   ],
     16   "methodology_tags": [
     17     "benchmark-eval"
     18   ],
     19   "key_findings": "DeepSeek-R1-Zero demonstrates that pure RL without supervised fine-tuning can incentivize sophisticated reasoning behaviors (self-reflection, verification) in LLMs, reaching 77.9% pass@1 on AIME 2024. The multi-stage pipeline (cold-start SFT → RL → rejection sampling SFT → RL) produces DeepSeek-R1, which achieves 79.8% on AIME 2024 and 97.3% on MATH-500, matching OpenAI-o1, and ranks alongside top closed-source models on ChatbotArena. Distilled smaller models (1.5B-70B) outperform GPT-4o and Claude-3.5-Sonnet on math benchmarks, and distillation outperforms RL alone on smaller architectures.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Inference code released on GitHub (https://github.com/deepseek-ai/DeepSeek-V3 and https://github.com/deepseek-ai/DeepSeek-R1). Model weights released on HuggingFace. Section I provides download and usage instructions."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Section I states 'We also release SFT and RL data to the public.' The paper uses many standard public benchmarks (MMLU, AIME, Codeforces, etc.) and releases model weights on HuggingFace."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Section I mentions 'pip install -r requirements.txt' but does not provide the actual environment specification or library versions in the paper itself. No Dockerfile or detailed environment setup section."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section I provides step-by-step commands for downloading weights, cloning the repo, installing dependencies, converting model weights, and running inference. Hardware requirements (16 H800 GPUs) are specified."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Tables report point estimates only (e.g., '79.8% AIME 2024'). No confidence intervals or error bars are provided despite multiple samples being generated. Bold numbers indicate t-test significance but no CIs."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Tables 3, 8, and 15 state 'Numbers in bold denote the performance is statistically significant (t-test with p < 0.01).' Statistical significance is used for comparative claims."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Results consistently report absolute scores with baselines for context (e.g., AIME from 15.6% initial to 77.9% after RL, Codeforces 96.3 percentile vs 58.7 for V3). Percentage improvements and rating differences are provided throughout Tables 3, 8, 12."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification for evaluation sample sizes. Pass@1 is averaged over k sampled responses (k=64 for AIME, k=16 for MATH, k=8 for LCB; Section D.1), but there is no power analysis or justification for why these k values are sufficient."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Results are reported as pass@1 averages over k samples but no standard deviations, IQR, or spread measures are provided across runs or seeds. Only point estimates in result tables."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Table 8 compares against Claude-3.5-Sonnet, GPT-4o, DeepSeek-V3, OpenAI-o1-mini, and OpenAI-o1-1217. Table 15 compares distilled models against GPT-4o and Claude-3.5-Sonnet."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include OpenAI-o1-1217, GPT-4o-0513, Claude-3.5-Sonnet-1022, and QwQ-32B-Preview — all contemporary models at time of publication (January 2025)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table 3 shows results at each pipeline stage (R1-Zero → Dev1 → Dev2 → Dev3 → R1), isolating the contribution of each training phase. Appendix B.6 provides ablation on language consistency reward. Section F.1 compares distillation vs RL."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Extensive multi-metric evaluation: MMLU, MMLU-Pro, GPQA, AIME, MATH-500, LiveCodeBench, Codeforces rating/percentile, SWE-Bench, Aider, AlpacaEval, ArenaHard, IFEval, SimpleQA, FRAMES, and more (Table 8)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section D.2 reports ChatbotArena human evaluation results with Elo rankings. Figure 11 shows style-controlled ranking. The platform uses double-blind pairwise comparisons with millions of user votes."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Multiple held-out test sets used. Section E.2 specifically tests on AIME 2025 (released after training) to verify generalization. LiveCodeBench uses problems from Aug 2024-Jan 2025. Decontamination applied (Section D.1)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Figures 15-17 provide per-category breakdowns of MMLU (by subject), MMLU-Pro (by domain), and math competition problems (by category: algebra, geometry, combinatorics, etc.). Table 14 breaks LiveCodeBench by difficulty."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 discusses specific failure modes: poor structural output, token inefficiency/overthinking, language mixing, prompt sensitivity, limited software engineering improvement. Section G.2 discusses unsuccessful attempts (PRM, MCTS)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Sections G.1 and G.2 report failed approaches: small-model RL (G.1), and Process Reward Models and MCTS (G.2). Figure 6 shows reward hacking (reward increases while Codeforces performance decreases). Appendix B.6 shows language consistency reward slightly degrades code performance."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims: (1) reasoning via pure RL — supported by R1-Zero results (Section 2, Table 3); (2) emergent self-reflection/verification — supported by Figure 9 and Table 2; (3) superior to supervised learning — supported by Table 12 R1 vs V3 comparison; (4) distillation to smaller models — supported by Table 15."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims about RL driving reasoning emergence are supported by the ablation structure: R1-Zero (pure RL, no SFT) develops reasoning behaviors tracked across training steps (Figures 1, 8, 9). The multi-stage pipeline (Table 3) isolates each phase's contribution. Ablation in B.6 tests language consistency reward causally."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper makes broad claims about 'incentivizing reasoning capability in LLMs' but results are specific to DeepSeek-V3-Base architecture. Section G.1 notes smaller models (7B, 16B) failed to show improvements, suggesting the approach may not generalize to all LLMs. The title and abstract are broader than the tested setting."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section A.1 acknowledges that pre-training data may contain OpenAI-model-generated answers on web pages. Section G.1 discusses base model capacity as a critical factor. The paper acknowledges reward hacking as an alternative to genuine improvement (Section B.5). Section 6 discusses prompt sensitivity affecting results."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper frames benchmark scores (AIME, MATH, Codeforces) as evidence of 'reasoning capability' without discussing the gap between benchmark performance and actual reasoning. No discussion of whether solving math competitions is a valid proxy for general reasoning ability, despite the title claiming general 'reasoning capability.'"
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Model versions specified: DeepSeek-V3-Base (671B/37B active MoE), GPT-4o-0513, Claude-3.5-Sonnet-1022, OpenAI-o1-1217, GPT-4-Turbo-1106 for evaluation. Baselines include snapshot dates. Distilled model base models specified in Table 6."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Extensive prompt text provided: training template (Table 1), evaluation prompts for each benchmark (Tables 18-32), reward model prompt (Appendix B.2), risk review prompt (Listing 8), SFT data generation prompts (Listings 1-4), and cold-start prompts (Listings 5-7)."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Detailed hyperparameters throughout: learning rate 3e-6, KL coefficient 0.001, temperature 1/0.7, batch size 512, GRPO clip ratio ε=10, max length 32768/65536, reference model update every 400 steps (Sections 2.1, 3.2). Distillation hyperparameters in Table 6. SFT hyperparameters in B.4.2."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. DeepSeek-R1 is a single model that generates responses directly. The RL training pipeline is infrastructure, not scaffolding."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section B.3 describes RL data composition (Table 4: 26K math, 17K code, 22K STEM, 15K logic, 66K general). Cold-start data creation (B.3.2) describes filtering pipeline. SFT data statistics in Table 5. Decontamination filtering described in D.1. Code test case generation pipeline in B.3.1."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Conclusion, Limitation, and Future Work' contains extensive discussion of specific limitations across multiple paragraphs covering structural output, token efficiency, language mixing, prompt sensitivity, software engineering, reward hacking."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 identifies specific threats: (1) pure RL depends on reliable reward signals which are hard for tasks like writing; (2) language mixing is caused by the base model's Chinese/English training data; (3) few-shot prompting degrades R1 performance; (4) limited software engineering RL data. Section G.2 details specific failed approaches."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 explicitly states what R1 cannot do: structural output, tool use, token efficiency for simple problems, non-Chinese/English languages. Section G.1 states smaller models failed, bounding the approach to large-scale models. Acknowledges reward hacking as unsolved for complex tasks."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Training data details are described but the actual training data is not fully released. Section I mentions SFT and RL data will be released but at time of paper this appears incomplete ('released at xxx' placeholder). Pre-training data is proprietary web crawl data."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section B.3 describes RL data collection across all categories with counts, sources, and characteristics. Cold-start data pipeline (B.3.2) describes DeepSeek-R1-Zero generation → filtering → DeepSeek-V3 refinement → human verification. Code test case generation in B.3.1."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants in the study. ChatbotArena is an external platform. The paper uses standard benchmarks and model-generated data."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Figure 2 illustrates the full multi-stage pipeline. Section B.3.2 documents cold-start data: prompts → R1-Zero generation (temp=1.0) → correctness filtering (sympy) → format filtering (repetition, language mixing) → V3 refinement → human verification. Table 5 provides final data statistics."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding disclosure or acknowledgments section mentioning funding sources. DeepSeek is a commercial AI company but no explicit funding statement is provided."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are listed under 'DeepSeek-AI' with the email research@deepseek.com. The affiliation is clear — this is a company paper evaluating its own product."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "DeepSeek-AI is a commercial AI company evaluating its own model. The funder (DeepSeek) has a direct financial interest in positive results. No independent funding or external validation mentioned."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement. DeepSeek is a for-profit company whose commercial product is the subject of the paper. No disclosure of patents, equity, or other financial interests."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section D.1 states 'DeepSeek-V3 base has a knowledge cutoff date of July 2024, predating evaluation benchmarks like CNMO 2024.'"
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Section D.1 describes comprehensive decontamination: 'filtered out any text segments (including web pages and GitHub files) that contained matching 10-gram sequences from evaluation questions or reference solutions.' In the math domain alone, ~6 million potential texts were identified for removal."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Section D.1 addresses contamination: n-gram filtering for pre-training and post-training data, temporal separation (math SFT from pre-2023 competitions only), and honest acknowledgment that 'n-gram based decontamination method cannot prevent the paraphrase of testset.' Section E.2 tests on AIME 2025 as fresh validation."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants study. ChatbotArena is an external crowdsourced platform, not a study run by the authors."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants study conducted by the authors."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants study conducted by the authors."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants study conducted by the authors."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants study conducted by the authors."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants study conducted by the authors."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants study conducted by the authors."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference cost or latency is reported, despite the model generating thousands of thinking tokens per problem. Figure 18 shows token counts but not wall-clock time or dollar cost."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Table 7 reports total training costs: 101K H800 GPU hours for R1-Zero, 5K for SFT data creation, 41K for R1, totaling 147K GPU hours ($294K at $2/GPU-hour). Hardware specified as 64×8 H800 GPUs, with R1-Zero training taking ~198 hours."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No results reported across multiple random seeds. Training curves (Figure 1) show single runs. Evaluation uses pass@k sampling but does not report sensitivity to random seeds in training."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Section D.1 specifies: k=64 for AIME and GPQA, k=16 for MATH and CodeForces, k=8 for LCB. Pass@1 formula is provided. Reward model trained for a single epoch (B.2). Training is 10,400 steps / 1.6 epochs for R1-Zero."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search budget reported. Section B.4 lists final hyperparameters but does not describe how they were selected or how many configurations were tried. The paper mentions smaller model experiments (7B, 16B, 32B, 230B) but not systematic search."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper presents final configurations without explaining how they were selected. For example, GRPO clip ratio ε=10 is described as crucial but the selection process is not documented."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes many pairwise comparisons across ~20 benchmarks and 6+ models with t-tests (p<0.01) but no multiple comparison correction (Bonferroni, Holm, etc.) is mentioned."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "DeepSeek employees evaluate DeepSeek models against competitors. No acknowledgment of self-comparison bias. Baselines from other companies are evaluated using the authors' framework. No independent evaluation mentioned beyond ChatbotArena."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "No comparison at matched compute budgets. DeepSeek-R1 uses long chains of thought (8K-18K tokens per problem, Figure 18) vs baselines that use much less compute. The 671B model is compared against models of various sizes without compute normalization."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "No discussion of whether benchmarks like AIME, MATH, or Codeforces actually measure 'reasoning capability' as claimed. The paper equates benchmark performance with reasoning without questioning construct validity."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding involved. The model generates responses directly. SWE-Bench uses the agentless framework consistently (Section D.1)."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "Section D.1 states training cutoff is July 2024 and notes benchmarks like CNMO 2024 postdate it. Math SFT prompts sourced exclusively from pre-2023 competitions. Section E.2 tests on AIME 2025 as temporal validation."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether evaluation prompts or framework setups leak information. For example, AlpacaEval and ArenaHard use GPT-4 as judge, and the evaluation prompt format could advantage certain response styles."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether benchmark problems share structural similarities with training data. The paper acknowledges that web pages may contain 'OpenAI-model-generated answers' (Section A.1), but benchmark-specific overlap is not analyzed beyond n-gram filtering."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Section D.1 describes concrete decontamination: 10-gram sequence matching against evaluation questions and reference solutions, removing ~6 million pre-training texts in math alone. Post-training data undergoes same n-gram protocol. Temporal split (pre-2023 math prompts only)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Pure RL without SFT can incentivize reasoning capabilities in LLMs, with DeepSeek-R1-Zero reaching 77.9% pass@1 on AIME 2024",
    371       "evidence": "Figure 1(a) shows AIME accuracy increasing from 15.6% to 77.9% during RL training. Table 8 shows final R1 at 79.8%. Self-reflection and verification behaviors emerge naturally (Table 2, Figure 9).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "DeepSeek-R1 matches or surpasses OpenAI-o1-1217 on math and coding benchmarks",
    376       "evidence": "Table 8: R1 achieves 79.8% vs o1's 79.2% on AIME, 97.3% vs 96.4% on MATH-500, 65.9% vs 63.4% on LiveCodeBench. Codeforces: 96.3 percentile vs 96.6. Some benchmarks show o1 ahead (SimpleQA, Aider).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Distilled small models (1.5B-70B) outperform GPT-4o and Claude-3.5-Sonnet on math benchmarks",
    381       "evidence": "Table 15: Even the 1.5B distilled model achieves 28.9% on AIME (vs GPT-4o's 9.3%) and 83.9% on MATH (vs 74.6%). Larger distilled models show progressively better results.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Distillation outperforms RL alone for smaller models",
    386       "evidence": "Table 16: DeepSeek-R1-Distill-Qwen-32B (72.6% AIME) significantly outperforms Qwen2.5-32B-Zero (47.0% AIME) which used RL directly on the 32B base.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Emergent reasoning behaviors (self-reflection, verification) develop autonomously during RL training",
    391       "evidence": "Figure 9 shows 5-7x increase in reflective word frequency. Table 2 shows 'aha moment' with 'wait' usage. Figure 1(b) shows increasing response length. However, these are correlational — hard to distinguish genuine reasoning from pattern mimicry.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "The total training cost for DeepSeek-R1 is approximately $294K (147K H800 GPU hours at $2/GPU-hour)",
    396       "evidence": "Table 7 provides detailed breakdown: 101K H800 GPU hours for R1-Zero, 41K for R1, 5K for SFT data at $2/GPU-hour.",
    397       "supported": "strong"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Company evaluating own product",
    403       "detail": "DeepSeek-AI employees evaluate DeepSeek models. No independent evaluation or external audit. While ChatbotArena provides some independent signal, the main benchmark evaluations are self-conducted."
    404     },
    405     {
    406       "flag": "No variance or uncertainty reporting",
    407       "detail": "Despite using pass@k evaluation with multiple samples, no standard deviations, confidence intervals, or result stability measures are reported across any benchmark. The reader cannot assess how reliable any single number is."
    408     },
    409     {
    410       "flag": "Selective OpenAI-o1 comparison",
    411       "detail": "The paper acknowledges 'accessing the OpenAI-o1-1217 API is challenging in mainland China' and reports o1 performance 'based on official reports.' This means some comparisons use self-reported numbers from the competitor rather than controlled evaluation."
    412     },
    413     {
    414       "flag": "Missing multiple comparison correction",
    415       "detail": "T-tests with p<0.01 are run across 20+ benchmarks and 6+ model comparisons without any family-wise error rate correction. With this many comparisons, some significant results could be spurious."
    416     },
    417     {
    418       "flag": "Benchmark-reasoning gap not discussed",
    419       "detail": "The paper equates benchmark performance on AIME/MATH/Codeforces with 'reasoning capability' without discussing whether these benchmarks are valid proxies for general reasoning. The claimed 'reasoning' could be sophisticated pattern matching on competition-style problems."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Evaluating large language models trained on code",
    425       "authors": [
    426         "M. Chen",
    427         "J. Tworek"
    428       ],
    429       "year": 2021,
    430       "arxiv_id": "2107.03374",
    431       "relevance": "Introduces HumanEval benchmark and pass@k evaluation methodology used throughout this paper."
    432     },
    433     {
    434       "title": "Training language models to follow instructions with human feedback",
    435       "authors": [
    436         "L. Ouyang",
    437         "J. Wu"
    438       ],
    439       "year": 2022,
    440       "relevance": "Foundational RLHF work establishing the SFT→RL pipeline; DeepSeek-R1-Zero departs from it by skipping the SFT stage."
    441     },
    442     {
    443       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    444       "authors": [
    445         "C. Snell",
    446         "J. Lee"
    447       ],
    448       "year": 2024,
    449       "arxiv_id": "2408.03314",
    450       "relevance": "Test-time compute scaling framework directly relevant to DeepSeek-R1's adaptive token generation strategy."
    451     },
    452     {
    453       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    454       "authors": [
    455         "J. Wei",
    456         "X. Wang"
    457       ],
    458       "year": 2022,
    459       "relevance": "Chain-of-thought prompting foundation that DeepSeek-R1 aims to surpass via RL-emergent reasoning."
    460     },
    461     {
    462       "title": "DeepSeek-V3 technical report",
    463       "authors": [
    464         "DeepSeek-AI"
    465       ],
    466       "year": 2024,
    467       "arxiv_id": "2412.19437",
    468       "relevance": "Base model for DeepSeek-R1; provides the 671B MoE architecture and pre-training details."
    469     },
    470     {
    471       "title": "Agentless: Demystifying LLM-based software engineering agents",
    472       "authors": [
    473         "C. S. Xia",
    474         "Y. Deng"
    475       ],
    476       "year": 2024,
    477       "relevance": "Framework used for SWE-Bench Verified evaluation of DeepSeek-R1."
    478     },
    479     {
    480       "title": "Let's verify step by step",
    481       "authors": [
    482         "H. Lightman",
    483         "V. Kosaraju"
    484       ],
    485       "year": 2024,
    486       "relevance": "Process reward model approach that DeepSeek-R1 considered but rejected in favor of outcome-based RL."
    487     },
    488     {
    489       "title": "Self-consistency improves chain of thought reasoning in language models",
    490       "authors": [
    491         "X. Wang",
    492         "J. Wei"
    493       ],
    494       "year": 2023,
    495       "relevance": "Self-consistency decoding used to boost DeepSeek-R1-Zero's AIME score from 77.9% to 86.7%."
    496     },
    497     {
    498       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    499       "authors": [
    500         "Z. Shao",
    501         "P. Wang"
    502       ],
    503       "year": 2024,
    504       "arxiv_id": "2402.03300",
    505       "relevance": "Introduces GRPO algorithm adopted for DeepSeek-R1 training."
    506     },
    507     {
    508       "title": "Distillation scaling laws",
    509       "authors": [
    510         "D. Busbridge"
    511       ],
    512       "year": 2025,
    513       "arxiv_id": "2502.08606",
    514       "relevance": "Scaling laws for knowledge distillation, supporting DeepSeek-R1's distillation approach."
    515     },
    516     {
    517       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    518       "authors": [
    519         "N. Jain",
    520         "K. Han"
    521       ],
    522       "year": 2024,
    523       "arxiv_id": "2403.07974",
    524       "relevance": "Key code generation benchmark used for evaluation with temporal contamination prevention."
    525     },
    526     {
    527       "title": "Constitutional AI: Harmlessness from AI feedback",
    528       "authors": [
    529         "Y. Bai",
    530         "A. Jones"
    531       ],
    532       "year": 2022,
    533       "relevance": "AI-feedback (RLAIF) safety alignment methodology that DeepSeek-R1's safety training builds upon."
    534     }
    535   ],
    536   "engagement_factors": {
    537     "practical_relevance": {
    538       "score": 3,
    539       "justification": "Open-weight models (1.5B-70B) released on HuggingFace that practitioners can immediately download and use for reasoning tasks."
    540     },
    541     "surprise_contrarian": {
    542       "score": 2,
    543       "justification": "Pure RL without SFT producing emergent reasoning behaviors and matching OpenAI-o1 challenges the assumption that supervised fine-tuning on human demonstrations is necessary."
    544     },
    545     "fear_safety": {
    546       "score": 1,
    547       "justification": "Paper acknowledges jailbreak vulnerabilities and enhanced capability for dangerous content but treats safety as secondary to the technical contribution."
    548     },
    549     "drama_conflict": {
    550       "score": 3,
    551       "justification": "A Chinese lab openly challenges OpenAI's flagship reasoning model, claims comparable performance at a fraction of the cost ($294K), and releases everything under MIT license."
    552     },
    553     "demo_ability": {
    554       "score": 3,
    555       "justification": "All model weights from 1.5B to 671B are publicly available on HuggingFace with inference code and instructions, and a hosted API exists."
    556     },
    557     "brand_recognition": {
    558       "score": 3,
    559       "justification": "DeepSeek-R1 became a global news story, directly competing with OpenAI's o1, and is one of the most discussed AI releases of 2025."
    560     }
    561   }
    562 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs