ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19704B)


      1 {
      2   "paper": {
      3     "title": "DAPO: An Open-Source LLM Reinforcement Learning System at Scale",
      4     "authors": ["Qiying Yu", "Zheng Zhang", "Ruofei Zhu", "Yufeng Yuan", "Xiaochen Zuo", "Yu Yue", "Weinan Dai", "Tiantian Fan", "Gaohong Liu", "Juncai Liu", "Lingjun Liu", "Xin Liu", "Haibin Lin", "Zhiqi Lin", "Bole Ma", "Guangming Sheng", "Yuxuan Tong", "Chi Zhang", "Mofan Zhang", "Ru Zhang", "Wang Zhang", "Hang Zhu", "Jinhua Zhu", "Hao Zhou", "Jingjing Liu", "Wei-Ying Ma", "Ya-Qin Zhang", "Lin Yan", "Mu Qiao", "Yonghui Wu", "Mingxuan Wang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2503.14476"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states it is built on the verl framework (https://github.com/volcengine/verl) and provides a project page (https://dapo-sia.github.io/). The abstract states 'we open-source our training code.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section 3.5 describes the DAPO-Math-17K dataset and the abstract states they open-source 'a carefully curated and processed dataset.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup with library versions is provided in the paper. Only the verl framework is mentioned."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The paper references a project page but does not include commands or a README-style guide."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., 50 points on AIME 2024). No confidence intervals or error bars are provided for the main results in Table 1."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims DAPO outperforms DeepSeek-R1-Zero-Qwen-32B (50 vs 47) without any statistical significance test."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 1 shows progressive improvements with baseline context: Naive GRPO at 30, each technique adding points up to DAPO at 50, compared to DeepSeek-R1-Zero-Qwen-32B at 47. Absolute improvements are clear from context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for evaluation sample sizes. AIME 2024 has only 30 problems. No power analysis or discussion of whether this is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper reports avg@32 (averaging over 32 repetitions) but does not report standard deviation, IQR, or any spread measure across these repetitions."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 compares against Naive GRPO and DeepSeek-R1-Zero-Qwen-32B."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "DeepSeek-R1-Zero-Qwen-32B (2025) is a contemporary baseline representing the state of the art at the time."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 1 shows a progressive ablation, adding each technique incrementally: Overlong Filtering (+6), Clip-Higher (+2), Soft Overlong Punishment (+3), Token-level Loss (+1), Dynamic Sampling (+8)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports avg@32, pass@32, and cons@32 on AIME 2024 (Figure 1)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "The paper evaluates math reasoning via automated rule-based verification. Human evaluation is not relevant for verifiable math correctness."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "AIME 2024 is used as the evaluation benchmark, separate from the DAPO-Math-17K training dataset. Section 4.3 also distinguishes training reward from validation accuracy."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Only aggregate AIME 2024 accuracy is reported. No per-problem or per-difficulty breakdown is provided."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.3 discusses entropy collapse and overfitting to the training set, and Section 3 discusses problems such as gibberish/repetitive generation and reward noise from truncated samples."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that naive GRPO achieved only 30 points (vs 47 target), discusses entropy collapse, and notes that training reward 'often exhibits little correlation with the accuracy on the validation set, which indicates overfitting.'"
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 50 points on AIME 2024 using Qwen2.5-32B, which is supported by Figure 1 and Table 1."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about each technique's contribution are supported by progressive ablation in Table 1, where each component is added incrementally in a controlled manner."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'LLM Reinforcement Learning System at Scale' but results are only on AIME 2024 math problems with a single base model (Qwen2.5-32B). Section 4.1 mentions it 'can be readily transferred to other tasks' without evidence."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the improvements. For example, the progressive ablation order could affect results, and there is no consideration of whether the improvements are due to hyperparameter tuning rather than the proposed techniques."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies Qwen2.5-32B as the base model, which is a specific versioned model name."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This is an RL training paper, not a prompting paper. The method uses rule-based reward signals, not prompt engineering."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.1 reports learning rate (1e-6), warm-up steps (20), prompt batch size (512), samples per prompt (16), mini-batch size (512), max length (20480), epsilon_low (0.2), epsilon_high (0.28), temperature (1.0), top_p (0.7)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a direct RL training system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.5 and Appendix A describe the dataset transformation process: sourcing from web/competitions, transforming answers to integers, using LLM-based reformulation with CoT reasoning. An example is provided."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section is present in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. Section 4.1 briefly says 'we focus specifically on mathematical tasks' but does not bound the generalization claims made elsewhere."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper states the dataset is open-sourced along with training code, enabling independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.5 describes data sourcing: 'sourced from the web and official competition homepages through a combination of web scraping and manual annotation.'"
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is math problems from public sources."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.5 and Appendix A describe the pipeline: source collection, answer format transformation to integers using LLM, resulting in DAPO-Math-17K with 17K prompts."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding sources are disclosed. Authors are affiliated with ByteDance Seed, Tsinghua University AIR, The University of Hong Kong, and SIA-Lab, but no explicit funding statement is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: ByteDance Seed, Tsinghua University AIR, The University of Hong Kong, and SIA-Lab."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "ByteDance is a major tech company with commercial interest in LLM capabilities. No discussion of whether the funder has a stake in the results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The training data cutoff of Qwen2.5-32B is not stated. The paper does not discuss when the base model's pretraining data was collected."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether AIME 2024 problems could appear in Qwen2.5-32B's pretraining data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "AIME 2024 problems are publicly available and could be in the pretraining data of Qwen2.5-32B. This is not addressed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost or latency figures are reported despite the method involving large-scale RL training and generation of up to 20,480 tokens per sample."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No GPU hours, total training time, or hardware specifications are reported despite this being a large-scale training system."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "DAPO achieves 50 points on AIME 2024 using Qwen2.5-32B base model",
    286       "evidence": "Figure 1 and Table 1 show AIME 2024 avg@32 score of 50.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "DAPO outperforms DeepSeek-R1-Zero-Qwen-32B (47 points) using 50% training steps",
    291       "evidence": "Figure 1 shows DAPO reaching higher accuracy in fewer steps. Table 1 shows 50 vs 47.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Each of the four key techniques contributes to improved performance",
    296       "evidence": "Table 1 shows progressive ablation: Naive GRPO 30 → +Overlong Filtering 36 → +Clip-Higher 38 → +Soft Overlong Punishment 41 → +Token-level Loss 42 → +Dynamic Sampling 50.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Clip-Higher strategy effectively addresses entropy collapse",
    301       "evidence": "Figure 2 shows entropy maintained with Clip-Higher vs collapse without it, along with accuracy improvement.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Reflective/backtracking reasoning behaviors emerge during RL training",
    306       "evidence": "Section 4.4 and Table 2 show qualitative examples of reflection emerging ('wait a moment, let's rethink').",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "DAPO proposes four techniques (Clip-Higher, Dynamic Sampling, Token-Level Policy Gradient Loss, Overlong Reward Shaping) that improve GRPO-based RL training for LLM reasoning, achieving 50 points on AIME 2024 with Qwen2.5-32B compared to DeepSeek-R1-Zero's 47 points. The progressive ablation shows each technique contributes, with Dynamic Sampling providing the largest single improvement (+8 points). The work is notable for open-sourcing training code, algorithm details, and a curated 17K math dataset, aiming to improve reproducibility in the LLM RL space.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance or uncertainty quantification",
    315       "detail": "Despite using avg@32 (32 repetitions), no standard deviation or confidence interval is reported. A 3-point improvement (47→50) on 30 AIME problems could be within noise."
    316     },
    317     {
    318       "flag": "Single benchmark evaluation",
    319       "detail": "All results are on AIME 2024 only (30 problems). No evaluation on other math benchmarks (MATH, GSM8K, etc.) or other domains is provided, despite claims of a general RL system."
    320     },
    321     {
    322       "flag": "Progressive ablation order dependency",
    323       "detail": "The ablation in Table 1 adds techniques in a fixed order. Individual technique contributions may differ if added in different orders or combinations, but no such analysis is provided."
    324     },
    325     {
    326       "flag": "No compute cost disclosure",
    327       "detail": "A paper about 'large-scale RL' does not report GPU hours, hardware, or training cost, making practical reproducibility assessment impossible."
    328     },
    329     {
    330       "flag": "Benchmark contamination risk",
    331       "detail": "AIME 2024 problems are publicly available and could be in Qwen2.5-32B's pretraining data. No contamination analysis is performed."
    332     },
    333     {
    334       "flag": "No limitations section",
    335       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    341       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    342       "year": 2025,
    343       "arxiv_id": "2501.12948",
    344       "relevance": "Primary baseline and motivation; key work on RL for LLM reasoning whose results DAPO aims to reproduce and surpass."
    345     },
    346     {
    347       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    348       "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"],
    349       "year": 2024,
    350       "arxiv_id": "2402.03300",
    351       "relevance": "Introduces GRPO algorithm that DAPO builds upon and modifies."
    352     },
    353     {
    354       "title": "Proximal Policy Optimization Algorithms",
    355       "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal"],
    356       "year": 2017,
    357       "arxiv_id": "1707.06347",
    358       "relevance": "Foundation RL algorithm that DAPO's clipping mechanism extends."
    359     },
    360     {
    361       "title": "Training language models to follow instructions with human feedback",
    362       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    363       "year": 2022,
    364       "relevance": "Foundational RLHF work that DAPO's approach relates to for LLM alignment."
    365     },
    366     {
    367       "title": "HybridFlow: A Flexible and Efficient RLHF Framework",
    368       "authors": ["Guangming Sheng", "Chi Zhang"],
    369       "year": 2024,
    370       "arxiv_id": "2409.19256",
    371       "relevance": "The verl framework that DAPO's implementation is built upon."
    372     },
    373     {
    374       "title": "RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning",
    375       "authors": ["Jonas Gehring", "Kunhao Zheng"],
    376       "year": 2025,
    377       "relevance": "Related work applying RL with execution feedback for code LLMs."
    378     },
    379     {
    380       "title": "Kimi k1.5: Scaling Reinforcement Learning with LLMs",
    381       "authors": ["Kimi Team"],
    382       "year": 2025,
    383       "arxiv_id": "2501.12599",
    384       "relevance": "Contemporary work on scaling RL for LLM reasoning, cited as withholding key training details."
    385     },
    386     {
    387       "title": "VinePPO: Unlocking RL Potential for LLM Reasoning through Refined Credit Assignment",
    388       "authors": ["Amirhossein Kazemnejad", "Milad Aghajohari"],
    389       "year": 2024,
    390       "arxiv_id": "2410.01679",
    391       "relevance": "Related approach to improving credit assignment in RL for LLM reasoning."
    392     },
    393     {
    394       "title": "An Empirical Study on Eliciting and Improving R1-like Reasoning Models",
    395       "authors": ["Zhipeng Chen", "Yingqian Min"],
    396       "year": 2025,
    397       "arxiv_id": "2503.04548",
    398       "relevance": "Community effort to reproduce DeepSeek R1 results, documenting similar challenges to those DAPO addresses."
    399     },
    400     {
    401       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    402       "authors": ["Noah Shinn", "Federico Cassano"],
    403       "year": 2023,
    404       "relevance": "Related work on using reinforcement learning signals for LLM self-improvement in code generation."
    405     }
    406   ]
    407 }

Impressum · Datenschutz