scan.json (22053B)
1 { 2 "paper": { 3 "title": "Dynamic Memory Management on GPUs with SYCL", 4 "authors": ["Russell K. Standish"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2504.18211", 8 "doi": "10.48550/arXiv.2504.18211" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "The SYCL port of Ouroboros dynamic memory allocator is functional and achieves performance within a factor of 2 of the original CUDA code for page-based algorithms, and within statistical noise for chunk-based algorithms using Intel's oneAPI. Adaptive C++ suffered from timeouts and deadlocks. The work also identified several SYCL deficiencies relative to CUDA, including lack of global nd_item access, masked group operations, and nanosleep equivalents.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository provided: https://github.com/highperformancecoder/Ouroboros-SYCL (footnote 1, Section 3). SYCL code in master branch, CUDA code in cuda-ouroboros branch, deoptimised version in deoptimised branch." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Raw results files are available in supplementary materials at https://osf.io/2zwrt/ (reference [7], Section 4)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Section 3 specifies: Intel oneAPI 2025.1 (icpx compiler), Codeplay's oneAPI for NVIDIA GPUs plugin, CUDA 12.8, Adaptive C++ commit f336ab84. Hardware specified as Dell Precision 7540 with i9-9880H and Quadro T2000, and Asus NUC 13 with i5-1340P and Iris Xe." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "Section 3 mentions cmake and ccmake steps with compiler flags, but there are no step-by-step reproduction instructions or README described. The paper gives fragments (compiler flags, branch names) but not a complete guide." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Figures show average allocation times but no error bars, confidence intervals, or uncertainty measures are visible or described." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims SYCL performance is 'within a factor of 2' and 'within statistical noise' of CUDA without any statistical tests to support these comparative claims." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports relative performance differences: 'about half that of the CUDA code' for page allocators, and 'within statistical noise' for chunk allocators, with baseline context provided in figures showing absolute timing values." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper iterates 10 times and reports averages, but does not justify why 10 iterations were chosen or discuss whether this is sufficient for reliable performance measurement." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper reports averages over 10 iterations (and averages over subsequent 9 iterations), but no standard deviation, variance, or spread measure is provided." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The original CUDA Ouroboros implementation serves as the baseline. A 'deoptimised' CUDA version is also included for fairer comparison (Section 3)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Ouroboros (2020) is the state-of-the-art dynamic GPU memory allocator according to the survey by Winter and Mlakar (2021). The original code is the appropriate baseline for a porting study." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The deoptimised CUDA version serves as an ablation — it removes CUDA-specific optimizations (embedded PTX, nanosleep, masked warp functions) to isolate the effect of language translation vs. optimization differences." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Performance is measured both as a function of allocation size and as a function of number of simultaneous allocations, across six different allocator algorithms (page, chunk, virtual array page/chunk, virtual list page/chunk)." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is irrelevant for a systems performance benchmarking paper measuring allocation times." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a systems performance benchmark, not a machine learning evaluation. There is no train/test split concept." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by all six allocator types (page, chunk, virtual array page/chunk, virtual list page/chunk) in Figures 1-6, and by multiple platforms (CUDA, deoptimised CUDA, Adaptive C++, oneAPI on Intel, oneAPI on NVIDIA)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses Adaptive C++ struggling with timeouts and deadlocks as thread count increased (Section 4), the active mask deadlock on NVIDIA GPUs (Section 2), and SYCLomatic's failure to generate compilable code." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Multiple negative results reported: Adaptive C++ timeouts/deadlocks, active mask code deadlocking on NVIDIA (works on Intel/CPU but not NVIDIA), deoptimised CUDA unexpectedly becoming more performant rather than slower (Section 4.1)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims the port enables cross-platform comparison and testing on non-CUDA platforms. The results in Section 4 support this with timing comparisons across CUDA, oneAPI on NVIDIA, oneAPI on Intel, and Adaptive C++." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's main causal claim is that SYCL translation causes a performance overhead. The controlled comparison (same hardware, same algorithm, CUDA vs SYCL via different compilers) is an adequate design for this claim. The deoptimised CUDA variant further controls for optimization differences." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Claims are bounded to the specific hardware tested (Quadro T2000, Iris Xe), specific compilers (oneAPI 2025.1, Adaptive C++), and the Ouroboros algorithms. The paper does not overclaim to all GPU memory management." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses JIT compilation as an alternative explanation for performance differences (Section 3), creates the deoptimised CUDA version to control for CUDA-specific optimizations, and notes the unexpected result of deoptimised CUDA being faster (Section 4.1)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures allocation/free times directly, which is exactly what it claims to measure. No proxy gap exists — performance claims match the granularity of measurements." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": false, 141 "answer": false, 142 "justification": "This paper does not use any ML models or LLMs. It is a systems programming paper." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "This paper does not use prompting. It is a systems programming paper." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Key experimental parameters are reported: allocation sizes, number of allocations (1024 for size experiments, 1000 bytes for thread scaling), 10 iterations, heap space configuration. Compiler flags are specified in Section 3." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. This is a systems programming paper." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3 describes the modification to separate first-iteration (JIT) timing from subsequent iterations, reporting both 'average over all iterations' and 'average over all but the first iteration' for fair comparison." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations or threats-to-validity section. Some limitations are mentioned in passing (e.g., Adaptive C++ issues, SYCL deficiencies in the Conclusion) but there is no substantive discussion." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity are discussed. The paper does not address potential confounds such as compiler optimization levels, OS scheduling effects, or whether the specific GPU models are representative." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state scope boundaries or what the results do NOT show. It does not discuss limitations of testing on only two hardware configurations or the limited compiler versions tested." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Raw results files are available in supplementary materials at OSF (reference [7]: https://osf.io/2zwrt/), along with a Ravel file for data analysis." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3 describes the benchmark driver programs, parameters (allocation size, number of allocations), iteration count (10), and timing methodology (average over all vs. subsequent iterations)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from automated performance benchmarks on specific hardware." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is straightforward: run driver programs with specified parameters → record allocation/free times over 10 iterations → compute averages. The modification to separate JIT timing is documented in Section 3." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed. The author is affiliated with 'High Performance Coders' but no funding acknowledgment is present." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliation is listed as 'High Performance Coders'. The paper evaluates open-source tools (SYCL compilers) rather than the author's own commercial product." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. The author's company 'High Performance Coders' may have a commercial interest in cross-platform GPU programming." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present. The author runs 'High Performance Coders' which could have financial interests related to SYCL consulting, but this is not disclosed." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This paper does not evaluate any pre-trained ML model on a benchmark. It is a systems performance benchmarking paper comparing CUDA and SYCL implementations." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "No pre-trained model is evaluated. This is a systems programming paper." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No pre-trained model is evaluated. This is a systems programming paper." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "This is a systems programming paper, not an ML inference paper. Cost in the ML sense is irrelevant." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The total computational budget for running the benchmarks is not stated. Hardware is specified but total time/compute spent is not quantified." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No random seeds are involved in the deterministic memory allocation benchmarks, but the paper does not discuss run-to-run variability or whether results are deterministic." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 3 states 'the program iterates ten times' and reports averages over all iterations and over subsequent iterations (excluding the first)." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": false, 305 "answer": false, 306 "justification": "No hyperparameter search is involved. The experiments use fixed parameters from the Ouroboros driver programs." 307 }, 308 "best_config_selection_justified": { 309 "applies": false, 310 "answer": false, 311 "justification": "No configuration selection is performed. All six allocator types and all platforms are reported." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The author ported the code and evaluates their own port against the original. No discussion of potential author-evaluation bias in the SYCL implementation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "Compute budget differences are negligible — all implementations run on the same hardware with the same workloads." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "The benchmarks directly measure allocation/free times, which is exactly what the paper claims to evaluate. The paper also notes the Ouroboros benchmarks include data correctness checks (write data, verify on readback)." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. This is a systems programming paper." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No ML model is being evaluated. This is a systems performance benchmark with no training data concept." 344 }, 345 "feature_leakage_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "No ML model is being evaluated. No feature/label relationship exists." 349 }, 350 "non_independence_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "No ML model is being evaluated. Benchmark runs are independent by nature." 354 }, 355 "leakage_detection_method": { 356 "applies": false, 357 "answer": false, 358 "justification": "No ML model is being evaluated. Data leakage is not a concept applicable to systems performance benchmarking." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "The SYCL port of Ouroboros is functional and produces correct results across all six allocator types.", 365 "evidence": "Section 4: 'data is written to the allocated chunks and checked' — the driver programs verify data correctness after allocation.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "SYCL page-based allocators achieve about half the performance of the original CUDA implementation.", 370 "evidence": "Section 4.1 and Figure 1: 'The performance of the SYCL code ends up being about half that of the CUDA code.'", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "SYCL chunk-based allocators achieve performance broadly in line with the original CUDA implementation.", 375 "evidence": "Section 4.2 and Figure 2: 'the implementation performance is broadly in line with the original Ouroboros implementation when run on the same hardware.'", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Deoptimising the CUDA code did not make it slower; it may have made it faster.", 380 "evidence": "Section 4.1: 'the attempt to deoptimise the CUDA code to make it more comparable to the SYCL version only seem to make it more performant, if anything.'", 381 "supported": "weak" 382 }, 383 { 384 "claim": "Adaptive C++ suffered from timeouts and deadlocks at higher thread counts.", 385 "evidence": "Section 4: 'the Adaptive C++ compiled code would struggle as the number of threads increased, with loops timing out or becoming deadlocked.'", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No error bars or uncertainty quantification", 392 "detail": "Performance figures show only average times with no error bars, standard deviation, or confidence intervals. Claims of 'within statistical noise' are made without any statistical test." 393 }, 394 { 395 "flag": "Very limited hardware tested", 396 "detail": "Only two hardware configurations tested (one NVIDIA laptop GPU, one Intel integrated GPU). No high-end or datacenter GPUs tested, limiting generalizability of performance claims." 397 }, 398 { 399 "flag": "No limitations section", 400 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries." 401 }, 402 { 403 "flag": "Single author self-evaluation", 404 "detail": "The author ported the code and evaluates their own port. No independent evaluation or acknowledgment of potential bias in the implementation quality." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "Ouroboros: virtualized queues for dynamic memory management on GPUs", 410 "authors": ["Martin Winter", "Daniel Mlakar", "Mathias Parger", "Markus Steinberger"], 411 "year": 2020, 412 "relevance": "The original CUDA implementation that this paper ports to SYCL; state-of-the-art GPU dynamic memory allocator." 413 }, 414 { 415 "title": "Are dynamic memory managers on GPUs slow? a survey and benchmarks", 416 "authors": ["Martin Winter", "Mathias Parger", "Daniel Mlakar", "Markus Steinberger"], 417 "year": 2021, 418 "relevance": "Survey and benchmark of GPU dynamic memory allocation approaches, establishing Ouroboros as most performant." 419 }, 420 { 421 "title": "Using SYCLomatic to migrate CUDA code to oneAPI adapting NVIDIA GPU", 422 "authors": ["Wentao Liang", "Norihisa Fujita", "Ryohei Kobayashi", "Taisuke Boku"], 423 "year": 2024, 424 "relevance": "Automatic CUDA-to-SYCL translation tool used as starting point for the port." 425 } 426 ] 427 }