scan.json (19986B)
1 { 2 "paper": { 3 "title": "Deep Reinforcement Learning that Matters", 4 "authors": ["Peter Henderson", "Riashat Islam", "Philip Bachman", "Joelle Pineau", "Doina Precup", "David Meger"], 5 "year": 2018, 6 "venue": "AAAI 2018", 7 "arxiv_id": "1709.06560" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "Code link provided: https://git.io/vFHnf (footnote 1). They also reference using publicly available OpenAI Baselines implementations." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "All experiments use publicly available OpenAI Gym MuJoCo environments (Hopper-v1, HalfCheetah-v1, Walker2d-v1, Swimmer-v1). No proprietary data was collected." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "Detailed hyperparameters for all algorithms (DDPG, PPO, TRPO, ACKTR) are provided in the supplemental material, including network architectures, learning rates, batch sizes, discount factors, and all relevant settings. Specific codebases and their versions are referenced." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": true, 29 "justification": "Code is released with a link, default hyperparameters are fully enumerated in the supplemental material, and experimental setup is described in detail including which codebases to use and which hyperparameters to vary." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "Bootstrap 95% confidence intervals are reported in Table 3. All learning curves show mean and standard error across random seeds. Tables 1 and 2 report mean ± standard error." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "2-sample t-tests, Kolmogorov-Smirnov tests, and bootstrap percent differences with 95% confidence intervals are reported in Tables 9-12 in the supplemental. The paper explicitly advocates for significance testing in RL." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Bootstrap percent differences with confidence bounds are reported (e.g., '44.47% (-80.62%, 111.72%)' for ACKTR vs DDPG on Walker2d). These provide effect size context beyond just p-values." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper explicitly discusses bootstrap power analysis to determine if sample sizes are sufficient (Section on Power Analysis). They acknowledge that 5 trials may be insufficient in some settings and demonstrate this empirically." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Standard error is reported across 5 random seeds for all experiments. Tables 1 and 2 show mean ± standard error. All graphs show mean and standard error bands." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares four policy gradient algorithms (TRPO, DDPG, PPO, ACKTR) against each other across multiple environments, codebases, and hyperparameter settings." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "All four algorithms (TRPO 2015, DDPG 2015, PPO 2017, ACKTR 2017) were contemporary state-of-the-art policy gradient methods at the time of writing." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper systematically varies individual hyperparameters (network architecture, activation functions, reward scaling, batch size) one at a time while holding others constant, which functions as ablation of these design choices." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple evaluation metrics are used and discussed: average return, maximum return, bootstrap confidence intervals, t-test statistics, KS-test statistics, and bootstrap percent differences." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "Human evaluation is not applicable to a study comparing RL algorithm reproducibility on simulated continuous control tasks." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is not a supervised learning study. RL evaluation is done via policy rollouts in the environment, which is the standard evaluation paradigm." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per algorithm (DDPG, TRPO, PPO, ACKTR), per environment (HalfCheetah, Hopper, Walker2d, Swimmer), per codebase, and per hyperparameter configuration." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Failure cases are extensively discussed: DDPG failing in unstable environments like Hopper (getting stuck at local optima), Swimmer local optima where the agent curls up rather than swimming, PPO with large networks failing completely." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The entire paper is fundamentally about negative results: showing that reported improvements may not be meaningful, that random seeds can create statistically different distributions, and that codebase differences lead to dramatically different performance." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims about variability in reported metrics, difficulty of reproduction, and the need for significance testing are all supported by the extensive experimental results across hyperparameters, seeds, environments, and codebases." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims are made about hyperparameters, seeds, and codebases affecting performance. These are justified through controlled single-variable experiments where one factor is varied while others are held constant." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper explicitly limits scope to policy gradient methods in continuous control MuJoCo domains: 'For clarity, we focus our investigation on policy gradient (PG) methods in continuous control.' The discussion acknowledges that findings may not extend to other RL settings." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses multiple explanations for observed phenomena, e.g., explaining why reward scaling affects DDPG through gradient saturation theory, why large networks may need different hyperparameters (KL divergence 33.52x higher), and why DDPG fails in unstable environments." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": false, 131 "answer": false, 132 "justification": "This paper does not use LLM APIs or pre-trained language models. The RL algorithms are trained from scratch with specified architectures." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "No prompting is used. This is a reinforcement learning study, not an LLM study." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Exhaustive hyperparameters are reported in the supplemental material for all four algorithms: network architectures, activation functions, learning rates, batch sizes, discount factors, noise parameters, optimizer settings, etc." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. The study evaluates standard RL algorithms." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Preprocessing is documented: normalized observations with running mean filter for all algorithms. Modifications to baseline implementations are described (e.g., changing DDPG evaluation to use 10 full trajectories)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "The Discussion and Conclusion section explicitly discusses limitations and open problems, including that the significance tests used 'may not be the best fit for comparing RL algorithms' and that 'further work is needed in significance testing and statistical analysis.'" 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats are discussed: the assumption that corrected significance metrics from supervised learning apply to RL may not hold; 5 trials may be insufficient; the focus on continuous control PG methods may not generalize to discrete settings or value-based methods." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Explicit scope boundaries: 'we focus our investigation on policy gradient (PG) methods in continuous control' using MuJoCo environments. The paper notes that DQN in Atari domains behaves differently and results may not transfer." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "Code is released at https://git.io/vFHnf which would allow regeneration of raw experimental data. All experiments use publicly available environments." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Data collection is fully described: 5 trials per experiment with different preset random seeds, 2M training timesteps, evaluation via average returns across last 100 trajectories. Bootstrap uses 10k iterations with pivotal method." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. This is a computational study using simulated environments." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from training to evaluation is documented: train for 2M timesteps, evaluate using final average returns across last 100 trajectories, compute bootstrap statistics with 10k iterations, run significance tests using scipy and Facebook Bootstrapped." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgements section: 'We thank NSERC, CIFAR, the Open Philanthropy Project, and the AWS Cloud Credits for Research Program.'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: McGill University and Microsoft Maluuba. The paper does not evaluate Microsoft products, so no conflict exists." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funders (NSERC, CIFAR, Open Philanthropy Project, AWS) have no financial interest in the outcome of RL reproducibility research. The paper evaluates open-source algorithms, not funder products." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "No pre-trained models are evaluated on benchmarks. All RL agents are trained from scratch." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable. RL agents are trained and evaluated in the same environment by design; there is no benchmark contamination concern for RL policy learning." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable. The study trains RL agents from scratch in simulated environments, not evaluating pre-trained models on static benchmarks." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, wall-clock time, or computational cost per experiment is reported despite running hundreds of experiments across multiple algorithms, environments, and hyperparameter configurations." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, GPU hours, or hardware specifications are reported. The acknowledgements mention AWS Cloud Credits but no quantification of compute used." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Hyperparameter choices (network architecture, activation functions, reward scaling) can significantly affect baseline algorithm performance in deep RL.", 286 "evidence": "Tables 1 and 2 show final evaluation performance varying dramatically across architectural choices. For PPO on HalfCheetah, (400,300) network achieves -1180 while (64,64) achieves 2201. DDPG reward scaling experiments (Figure 3) show failure to learn below σ=0.01.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Random seeds alone can produce statistically different performance distributions with identical hyperparameters.", 291 "evidence": "Figure 5 shows two groups of 5 seeds producing significantly different TRPO learning curves on HalfCheetah (t=-9.09, p=0.0016). Supplemental Figures 24-25 show similar results for DDPG.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Different codebases implementing the same algorithm can produce dramatically different results.", 296 "evidence": "Figures 6, 34, 35 and Tables 1-2 show TRPO and DDPG performance varying substantially across three codebases (OpenAI Baselines, rllab, original) using identical hyperparameters.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "No single algorithm consistently outperforms others across all benchmark environments.", 301 "evidence": "Figure 4 and Tables 3, 9-12 show DDPG excels on HalfCheetah but fails on Hopper, while TRPO dominates Swimmer but is weak on HalfCheetah. Significance tests confirm many pairwise comparisons are not statistically significant.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Bootstrap confidence intervals and significance testing can help assess whether reported improvements are meaningful.", 306 "evidence": "Tables 3, 9-12 demonstrate bootstrap CIs and significance tests. For example, ACKTR vs DDPG on Walker2d shows overlapping CIs (t=1.03, p=0.334) despite apparent performance difference, while TRPO vs DDPG on HalfCheetah is significant (t=-4.59, p=0.002).", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "Reproducibility in deep RL is severely affected by hyperparameter choices, random seed selection, choice of evaluation environment, and implementation codebase. The paper demonstrates that random seeds alone can create statistically different distributions, that different codebases of the same algorithm yield dramatically different results, and that no single algorithm dominates across environments. The authors advocate for bootstrap confidence intervals, significance testing, and power analysis as standard practice in RL evaluation.", 312 "red_flags": [], 313 "cited_papers": [ 314 { 315 "title": "Benchmarking deep reinforcement learning for continuous control", 316 "authors": ["Yan Duan", "Xi Chen", "Rein Houthooft", "John Schulman", "Pieter Abbeel"], 317 "year": 2016, 318 "relevance": "Foundational RL benchmarking paper providing baseline implementations used in this study; directly relevant to evaluation methodology." 319 }, 320 { 321 "title": "Proximal policy optimization algorithms", 322 "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal", "Alec Radford", "Oleg Klimov"], 323 "year": 2017, 324 "arxiv_id": "1707.06347", 325 "relevance": "Key RL algorithm evaluated in the study; demonstrates how implementation details affect reproducibility." 326 }, 327 { 328 "title": "Trust region policy optimization", 329 "authors": ["John Schulman", "Sergey Levine", "Pieter Abbeel", "Michael Jordan", "Philipp Moritz"], 330 "year": 2015, 331 "relevance": "Core algorithm evaluated across multiple codebases showing dramatic performance differences." 332 }, 333 { 334 "title": "Evaluating the replicability of significance tests for comparing learning algorithms", 335 "authors": ["Remco R. Bouckaert", "Eibe Frank"], 336 "year": 2004, 337 "relevance": "Methodological foundation for significance testing in ML that the paper adapts for RL evaluation." 338 }, 339 { 340 "title": "Machine learning that matters", 341 "authors": ["Kiri Wagstaff"], 342 "year": 2012, 343 "arxiv_id": "1206.4656", 344 "relevance": "Influential position paper on ML evaluation practices and real-world applicability that inspired the RL reproducibility investigation." 345 }, 346 { 347 "title": "Revisiting the arcade learning environment: Evaluation protocols and open problems for general agents", 348 "authors": ["Marlos C. Machado", "Marc G. Bellemare", "Erik Talvitie", "Joel Veness", "Matthew Hausknecht", "Michael Bowling"], 349 "year": 2017, 350 "arxiv_id": "1709.06009", 351 "relevance": "Proposes better evaluation methods for RL benchmarks, complementary to this paper's reproducibility focus." 352 }, 353 { 354 "title": "Inference for the generalization error", 355 "authors": ["Claude Nadeau", "Yoshua Bengio"], 356 "year": 2000, 357 "relevance": "Statistical methodology for comparing ML algorithms that informs the significance testing approach proposed here." 358 } 359 ] 360 }