alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
AtomicUniformCudaHip.hpp
Go to the documentation of this file.
1/* Copyright 2022 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11
12#include <type_traits>
13
14#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
15
16namespace alpaka
17{
18 //! The GPU CUDA/HIP accelerator atomic ops.
19 //
20 // Atomics can be used in the hierarchy level grids, blocks and threads.
21 // Atomics are not guaranteed to be safe between devices.
25} // namespace alpaka
26
27# if !defined(ALPAKA_HOST_ONLY)
28
29# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
30# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
31# endif
32
33# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
34# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
35# endif
36
37//! clang is providing a builtin for different atomic functions even if these is not supported for architectures < 6.0
38# define CLANG_CUDA_PTX_WORKAROUND \
39 (BOOST_COMP_CLANG && BOOST_LANG_CUDA && BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(6, 0, 0))
40
41//! These types must be in the global namespace for checking existence of respective functions in global namespace via
42//! SFINAE, so we use inline namespace.
43inline namespace alpakaGlobal
44{
45 //! Provide an interface to builtin atomic functions.
46 //
47 // To check for the existence of builtin functions located in the global namespace :: directly.
48 // This would not be possible without having these types in global namespace.
49 // If the functor is inheriting from std::false_type an signature is explicitly not available. This can be used to
50 // explicitly disable builtin function in case the builtin is broken.
51 // If the functor is inheriting from std::true_type a specialization must implement one of the following
52 // interfaces.
53 // \code{.cpp}
54 // // interface for all atomics except atomicCas
55 // __device__ static T atomic( T* add, T value);
56 // // interface for atomicCas only
57 // __device__ static T atomic( T* add, T compare, T value);
58 // \endcode
59 template<typename TOp, typename T, typename THierarchy, typename TSfinae = void>
60 struct AlpakaBuiltInAtomic : std::false_type
61 {
62 };
63
64 // Cas.
65 template<typename T, typename THierarchy>
66 struct AlpakaBuiltInAtomic<
67 alpaka::AtomicCas,
68 T,
69 THierarchy,
70 typename std::void_t<
71 decltype(atomicCAS(alpaka::core::declval<T*>(), alpaka::core::declval<T>(), alpaka::core::declval<T>()))>>
72 : std::true_type
73 {
74 static __device__ T atomic(T* add, T compare, T value)
75 {
76 return atomicCAS(add, compare, value);
77 }
78 };
79
80# if !CLANG_CUDA_PTX_WORKAROUND
81 template<typename T>
82 struct AlpakaBuiltInAtomic<
83 alpaka::AtomicCas,
84 T,
86 typename std::void_t<decltype(atomicCAS_block(
87 alpaka::core::declval<T*>(),
88 alpaka::core::declval<T>(),
89 alpaka::core::declval<T>()))>> : std::true_type
90 {
91 static __device__ T atomic(T* add, T compare, T value)
92 {
93 return atomicCAS_block(add, compare, value);
94 }
95 };
96# endif
97
98
99 // Add.
100 template<typename T, typename THierarchy>
101 struct AlpakaBuiltInAtomic<
102 alpaka::AtomicAdd,
103 T,
104 THierarchy,
105 typename std::void_t<decltype(atomicAdd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
106 : std::true_type
107 {
108 static __device__ T atomic(T* add, T value)
109 {
110 return atomicAdd(add, value);
111 }
112 };
113
114
115# if !CLANG_CUDA_PTX_WORKAROUND
116 template<typename T>
117 struct AlpakaBuiltInAtomic<
118 alpaka::AtomicAdd,
119 T,
121 typename std::void_t<decltype(atomicAdd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
122 : std::true_type
123 {
124 static __device__ T atomic(T* add, T value)
125 {
126 return atomicAdd_block(add, value);
127 }
128 };
129# endif
130
131# if CLANG_CUDA_PTX_WORKAROUND
132 // clang is providing a builtin for atomicAdd even if these is not supported by the current architecture
133 template<typename THierarchy>
134 struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, double, THierarchy> : std::false_type
135 {
136 };
137# endif
138
139# if(BOOST_LANG_HIP)
140 // HIP shows bad performance with builtin atomicAdd(float*,float) for the hierarchy threads therefore we do not
141 // call the buildin method and instead use the atomicCAS emulation. For details see:
142 // https://github.com/alpaka-group/alpaka/issues/1657
143 template<>
144 struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, float, alpaka::hierarchy::Threads> : std::false_type
145 {
146 };
147# endif
148
149 // Sub.
150
151 template<typename T, typename THierarchy>
152 struct AlpakaBuiltInAtomic<
153 alpaka::AtomicSub,
154 T,
155 THierarchy,
156 typename std::void_t<decltype(atomicSub(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
157 : std::true_type
158 {
159 static __device__ T atomic(T* add, T value)
160 {
161 return atomicSub(add, value);
162 }
163 };
164
165# if !CLANG_CUDA_PTX_WORKAROUND
166 template<typename T>
167 struct AlpakaBuiltInAtomic<
168 alpaka::AtomicSub,
169 T,
171 typename std::void_t<decltype(atomicSub_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
172 : std::true_type
173 {
174 static __device__ T atomic(T* add, T value)
175 {
176 return atomicSub_block(add, value);
177 }
178 };
179# endif
180
181 // Min.
182 template<typename T, typename THierarchy>
183 struct AlpakaBuiltInAtomic<
184 alpaka::AtomicMin,
185 T,
186 THierarchy,
187 typename std::void_t<decltype(atomicMin(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
188 : std::true_type
189 {
190 static __device__ T atomic(T* add, T value)
191 {
192 return atomicMin(add, value);
193 }
194 };
195
196# if !CLANG_CUDA_PTX_WORKAROUND
197 template<typename T>
198 struct AlpakaBuiltInAtomic<
199 alpaka::AtomicMin,
200 T,
202 typename std::void_t<decltype(atomicMin_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
203 : std::true_type
204 {
205 static __device__ T atomic(T* add, T value)
206 {
207 return atomicMin_block(add, value);
208 }
209 };
210# endif
211
212// disable HIP atomicMin: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
213# if(BOOST_LANG_HIP)
214 template<typename THierarchy>
215 struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, THierarchy> : std::false_type
216 {
217 };
218
219 template<>
220 struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, alpaka::hierarchy::Threads> : std::false_type
221 {
222 };
223
224 template<typename THierarchy>
225 struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, THierarchy> : std::false_type
226 {
227 };
228
229 template<>
230 struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, alpaka::hierarchy::Threads> : std::false_type
231 {
232 };
233
234# if !__has_builtin(__hip_atomic_compare_exchange_strong)
235 template<typename THierarchy>
236 struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, THierarchy> : std::false_type
237 {
238 };
239
240 template<>
241 struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
242 {
243 };
244# endif
245# endif
246
247 // Max.
248
249 template<typename T, typename THierarchy>
250 struct AlpakaBuiltInAtomic<
251 alpaka::AtomicMax,
252 T,
253 THierarchy,
254 typename std::void_t<decltype(atomicMax(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
255 : std::true_type
256 {
257 static __device__ T atomic(T* add, T value)
258 {
259 return atomicMax(add, value);
260 }
261 };
262
263# if !CLANG_CUDA_PTX_WORKAROUND
264 template<typename T>
265 struct AlpakaBuiltInAtomic<
266 alpaka::AtomicMax,
267 T,
269 typename std::void_t<decltype(atomicMax_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
270 : std::true_type
271 {
272 static __device__ T atomic(T* add, T value)
273 {
274 return atomicMax_block(add, value);
275 }
276 };
277# endif
278
279 // disable HIP atomicMax: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
280# if(BOOST_LANG_HIP)
281 template<typename THierarchy>
282 struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, THierarchy> : std::false_type
283 {
284 };
285
286 template<>
287 struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, alpaka::hierarchy::Threads> : std::false_type
288 {
289 };
290
291 template<typename THierarchy>
292 struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, THierarchy> : std::false_type
293 {
294 };
295
296 template<>
297 struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, alpaka::hierarchy::Threads> : std::false_type
298 {
299 };
300
301# if !__has_builtin(__hip_atomic_compare_exchange_strong)
302 template<typename THierarchy>
303 struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, THierarchy> : std::false_type
304 {
305 };
306
307 template<>
308 struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
309 {
310 };
311# endif
312# endif
313
314
315 // Exch.
316
317 template<typename T, typename THierarchy>
318 struct AlpakaBuiltInAtomic<
319 alpaka::AtomicExch,
320 T,
321 THierarchy,
322 typename std::void_t<decltype(atomicExch(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
323 : std::true_type
324 {
325 static __device__ T atomic(T* add, T value)
326 {
327 return atomicExch(add, value);
328 }
329 };
330
331# if !CLANG_CUDA_PTX_WORKAROUND
332 template<typename T>
333 struct AlpakaBuiltInAtomic<
334 alpaka::AtomicExch,
335 T,
337 typename std::void_t<decltype(atomicExch_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
338 : std::true_type
339 {
340 static __device__ T atomic(T* add, T value)
341 {
342 return atomicExch_block(add, value);
343 }
344 };
345# endif
346
347 // Inc.
348
349 template<typename T, typename THierarchy>
350 struct AlpakaBuiltInAtomic<
351 alpaka::AtomicInc,
352 T,
353 THierarchy,
354 typename std::void_t<decltype(atomicInc(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
355 : std::true_type
356 {
357 static __device__ T atomic(T* add, T value)
358 {
359 return atomicInc(add, value);
360 }
361 };
362
363# if !CLANG_CUDA_PTX_WORKAROUND
364 template<typename T>
365 struct AlpakaBuiltInAtomic<
366 alpaka::AtomicInc,
367 T,
369 typename std::void_t<decltype(atomicInc_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
370 : std::true_type
371 {
372 static __device__ T atomic(T* add, T value)
373 {
374 return atomicInc_block(add, value);
375 }
376 };
377# endif
378
379 // Dec.
380
381 template<typename T, typename THierarchy>
382 struct AlpakaBuiltInAtomic<
383 alpaka::AtomicDec,
384 T,
385 THierarchy,
386 typename std::void_t<decltype(atomicDec(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
387 : std::true_type
388 {
389 static __device__ T atomic(T* add, T value)
390 {
391 return atomicDec(add, value);
392 }
393 };
394
395# if !CLANG_CUDA_PTX_WORKAROUND
396 template<typename T>
397 struct AlpakaBuiltInAtomic<
398 alpaka::AtomicDec,
399 T,
401 typename std::void_t<decltype(atomicDec_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
402 : std::true_type
403 {
404 static __device__ T atomic(T* add, T value)
405 {
406 return atomicDec_block(add, value);
407 }
408 };
409# endif
410
411 // And.
412
413 template<typename T, typename THierarchy>
414 struct AlpakaBuiltInAtomic<
415 alpaka::AtomicAnd,
416 T,
417 THierarchy,
418 typename std::void_t<decltype(atomicAnd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
419 : std::true_type
420 {
421 static __device__ T atomic(T* add, T value)
422 {
423 return atomicAnd(add, value);
424 }
425 };
426
427# if !CLANG_CUDA_PTX_WORKAROUND
428 template<typename T>
429 struct AlpakaBuiltInAtomic<
430 alpaka::AtomicAnd,
431 T,
433 typename std::void_t<decltype(atomicAnd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
434 : std::true_type
435 {
436 static __device__ T atomic(T* add, T value)
437 {
438 return atomicAnd_block(add, value);
439 }
440 };
441# endif
442
443 // Or.
444
445 template<typename T, typename THierarchy>
446 struct AlpakaBuiltInAtomic<
447 alpaka::AtomicOr,
448 T,
449 THierarchy,
450 typename std::void_t<decltype(atomicOr(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
451 : std::true_type
452 {
453 static __device__ T atomic(T* add, T value)
454 {
455 return atomicOr(add, value);
456 }
457 };
458
459# if !CLANG_CUDA_PTX_WORKAROUND
460 template<typename T>
461 struct AlpakaBuiltInAtomic<
462 alpaka::AtomicOr,
463 T,
465 typename std::void_t<decltype(atomicOr_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
466 : std::true_type
467 {
468 static __device__ T atomic(T* add, T value)
469 {
470 return atomicOr_block(add, value);
471 }
472 };
473# endif
474
475 // Xor.
476
477 template<typename T, typename THierarchy>
478 struct AlpakaBuiltInAtomic<
479 alpaka::AtomicXor,
480 T,
481 THierarchy,
482 typename std::void_t<decltype(atomicXor(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
483 : std::true_type
484 {
485 static __device__ T atomic(T* add, T value)
486 {
487 return atomicXor(add, value);
488 }
489 };
490
491# if !CLANG_CUDA_PTX_WORKAROUND
492 template<typename T>
493 struct AlpakaBuiltInAtomic<
494 alpaka::AtomicXor,
495 T,
497 typename std::void_t<decltype(atomicXor_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
498 : std::true_type
499 {
500 static __device__ T atomic(T* add, T value)
501 {
502 return atomicXor_block(add, value);
503 }
504 };
505# endif
506
507} // namespace alpakaGlobal
508
509# undef CLANG_CUDA_PTX_WORKAROUND
510# endif
511
512#endif
The GPU CUDA/HIP accelerator atomic ops.
These types must be in the global namespace for checking existence of respective functions in global ...
The alpaka accelerator library.