alpaka
Abstraction Library for Parallel Kernel Acceleration
AtomicUniformCudaHip.hpp
Go to the documentation of this file.
1 /* Copyright 2022 RenĂ© Widera
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 #include "alpaka/atomic/Op.hpp"
10 #include "alpaka/core/Utility.hpp"
11 
12 #include <type_traits>
13 
14 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
15 
16 namespace alpaka
17 {
18  //! The GPU CUDA/HIP accelerator atomic ops.
19  //
20  // Atomics can be used in the hierarchy level grids, blocks and threads.
21  // Atomics are not guaranteed to be safe between devices.
23  {
24  };
25 } // namespace alpaka
26 
27 # if !defined(ALPAKA_HOST_ONLY)
28 
29 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
30 # error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
31 # endif
32 
33 # if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
34 # error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
35 # endif
36 
37 //! clang is providing a builtin for different atomic functions even if these is not supported for architectures < 6.0
38 # define CLANG_CUDA_PTX_WORKAROUND \
39  (BOOST_COMP_CLANG && BOOST_LANG_CUDA && BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(6, 0, 0))
40 
41 //! These types must be in the global namespace for checking existence of respective functions in global namespace via
42 //! SFINAE, so we use inline namespace.
43 inline namespace alpakaGlobal
44 {
45  //! Provide an interface to builtin atomic functions.
46  //
47  // To check for the existence of builtin functions located in the global namespace :: directly.
48  // This would not be possible without having these types in global namespace.
49  // If the functor is inheriting from std::false_type an signature is explicitly not available. This can be used to
50  // explicitly disable builtin function in case the builtin is broken.
51  // If the functor is inheriting from std::true_type a specialization must implement one of the following
52  // interfaces.
53  // \code{.cpp}
54  // // interface for all atomics except atomicCas
55  // __device__ static T atomic( T* add, T value);
56  // // interface for atomicCas only
57  // __device__ static T atomic( T* add, T compare, T value);
58  // \endcode
59  template<typename TOp, typename T, typename THierarchy, typename TSfinae = void>
60  struct AlpakaBuiltInAtomic : std::false_type
61  {
62  };
63 
64  // Cas.
65  template<typename T, typename THierarchy>
67  alpaka::AtomicCas,
68  T,
69  THierarchy,
70  typename std::void_t<
71  decltype(atomicCAS(alpaka::core::declval<T*>(), alpaka::core::declval<T>(), alpaka::core::declval<T>()))>>
72  : std::true_type
73  {
74  static __device__ T atomic(T* add, T compare, T value)
75  {
76  return atomicCAS(add, compare, value);
77  }
78  };
79 
80 # if !CLANG_CUDA_PTX_WORKAROUND
81  template<typename T>
83  alpaka::AtomicCas,
84  T,
86  typename std::void_t<decltype(atomicCAS_block(
87  alpaka::core::declval<T*>(),
88  alpaka::core::declval<T>(),
89  alpaka::core::declval<T>()))>> : std::true_type
90  {
91  static __device__ T atomic(T* add, T compare, T value)
92  {
93  return atomicCAS_block(add, compare, value);
94  }
95  };
96 # endif
97 
98 
99  // Add.
100  template<typename T, typename THierarchy>
102  alpaka::AtomicAdd,
103  T,
104  THierarchy,
105  typename std::void_t<decltype(atomicAdd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
106  : std::true_type
107  {
108  static __device__ T atomic(T* add, T value)
109  {
110  return atomicAdd(add, value);
111  }
112  };
113 
114 
115 # if !CLANG_CUDA_PTX_WORKAROUND
116  template<typename T>
118  alpaka::AtomicAdd,
119  T,
121  typename std::void_t<decltype(atomicAdd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
122  : std::true_type
123  {
124  static __device__ T atomic(T* add, T value)
125  {
126  return atomicAdd_block(add, value);
127  }
128  };
129 # endif
130 
131 # if CLANG_CUDA_PTX_WORKAROUND
132  // clang is providing a builtin for atomicAdd even if these is not supported by the current architecture
133  template<typename THierarchy>
134  struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, double, THierarchy> : std::false_type
135  {
136  };
137 # endif
138 
139 # if(BOOST_LANG_HIP)
140  // HIP shows bad performance with builtin atomicAdd(float*,float) for the hierarchy threads therefore we do not
141  // call the buildin method and instead use the atomicCAS emulation. For details see:
142  // https://github.com/alpaka-group/alpaka/issues/1657
143  template<>
144  struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, float, alpaka::hierarchy::Threads> : std::false_type
145  {
146  };
147 # endif
148 
149  // Sub.
150 
151  template<typename T, typename THierarchy>
153  alpaka::AtomicSub,
154  T,
155  THierarchy,
156  typename std::void_t<decltype(atomicSub(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
157  : std::true_type
158  {
159  static __device__ T atomic(T* add, T value)
160  {
161  return atomicSub(add, value);
162  }
163  };
164 
165 # if !CLANG_CUDA_PTX_WORKAROUND
166  template<typename T>
168  alpaka::AtomicSub,
169  T,
171  typename std::void_t<decltype(atomicSub_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
172  : std::true_type
173  {
174  static __device__ T atomic(T* add, T value)
175  {
176  return atomicSub_block(add, value);
177  }
178  };
179 # endif
180 
181  // Min.
182  template<typename T, typename THierarchy>
184  alpaka::AtomicMin,
185  T,
186  THierarchy,
187  typename std::void_t<decltype(atomicMin(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
188  : std::true_type
189  {
190  static __device__ T atomic(T* add, T value)
191  {
192  return atomicMin(add, value);
193  }
194  };
195 
196 # if !CLANG_CUDA_PTX_WORKAROUND
197  template<typename T>
199  alpaka::AtomicMin,
200  T,
202  typename std::void_t<decltype(atomicMin_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
203  : std::true_type
204  {
205  static __device__ T atomic(T* add, T value)
206  {
207  return atomicMin_block(add, value);
208  }
209  };
210 # endif
211 
212 // disable HIP atomicMin: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
213 # if(BOOST_LANG_HIP)
214  template<typename THierarchy>
215  struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, THierarchy> : std::false_type
216  {
217  };
218 
219  template<>
220  struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, alpaka::hierarchy::Threads> : std::false_type
221  {
222  };
223 
224  template<typename THierarchy>
225  struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, THierarchy> : std::false_type
226  {
227  };
228 
229  template<>
230  struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, alpaka::hierarchy::Threads> : std::false_type
231  {
232  };
233 
234 # if !__has_builtin(__hip_atomic_compare_exchange_strong)
235  template<typename THierarchy>
236  struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, THierarchy> : std::false_type
237  {
238  };
239 
240  template<>
241  struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
242  {
243  };
244 # endif
245 # endif
246 
247  // Max.
248 
249  template<typename T, typename THierarchy>
251  alpaka::AtomicMax,
252  T,
253  THierarchy,
254  typename std::void_t<decltype(atomicMax(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
255  : std::true_type
256  {
257  static __device__ T atomic(T* add, T value)
258  {
259  return atomicMax(add, value);
260  }
261  };
262 
263 # if !CLANG_CUDA_PTX_WORKAROUND
264  template<typename T>
266  alpaka::AtomicMax,
267  T,
269  typename std::void_t<decltype(atomicMax_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
270  : std::true_type
271  {
272  static __device__ T atomic(T* add, T value)
273  {
274  return atomicMax_block(add, value);
275  }
276  };
277 # endif
278 
279  // disable HIP atomicMax: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
280 # if(BOOST_LANG_HIP)
281  template<typename THierarchy>
282  struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, THierarchy> : std::false_type
283  {
284  };
285 
286  template<>
287  struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, alpaka::hierarchy::Threads> : std::false_type
288  {
289  };
290 
291  template<typename THierarchy>
292  struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, THierarchy> : std::false_type
293  {
294  };
295 
296  template<>
297  struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, alpaka::hierarchy::Threads> : std::false_type
298  {
299  };
300 
301 # if !__has_builtin(__hip_atomic_compare_exchange_strong)
302  template<typename THierarchy>
303  struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, THierarchy> : std::false_type
304  {
305  };
306 
307  template<>
308  struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
309  {
310  };
311 # endif
312 # endif
313 
314 
315  // Exch.
316 
317  template<typename T, typename THierarchy>
319  alpaka::AtomicExch,
320  T,
321  THierarchy,
322  typename std::void_t<decltype(atomicExch(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
323  : std::true_type
324  {
325  static __device__ T atomic(T* add, T value)
326  {
327  return atomicExch(add, value);
328  }
329  };
330 
331 # if !CLANG_CUDA_PTX_WORKAROUND
332  template<typename T>
334  alpaka::AtomicExch,
335  T,
337  typename std::void_t<decltype(atomicExch_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
338  : std::true_type
339  {
340  static __device__ T atomic(T* add, T value)
341  {
342  return atomicExch_block(add, value);
343  }
344  };
345 # endif
346 
347  // Inc.
348 
349  template<typename T, typename THierarchy>
351  alpaka::AtomicInc,
352  T,
353  THierarchy,
354  typename std::void_t<decltype(atomicInc(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
355  : std::true_type
356  {
357  static __device__ T atomic(T* add, T value)
358  {
359  return atomicInc(add, value);
360  }
361  };
362 
363 # if !CLANG_CUDA_PTX_WORKAROUND
364  template<typename T>
366  alpaka::AtomicInc,
367  T,
369  typename std::void_t<decltype(atomicInc_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
370  : std::true_type
371  {
372  static __device__ T atomic(T* add, T value)
373  {
374  return atomicInc_block(add, value);
375  }
376  };
377 # endif
378 
379  // Dec.
380 
381  template<typename T, typename THierarchy>
383  alpaka::AtomicDec,
384  T,
385  THierarchy,
386  typename std::void_t<decltype(atomicDec(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
387  : std::true_type
388  {
389  static __device__ T atomic(T* add, T value)
390  {
391  return atomicDec(add, value);
392  }
393  };
394 
395 # if !CLANG_CUDA_PTX_WORKAROUND
396  template<typename T>
398  alpaka::AtomicDec,
399  T,
401  typename std::void_t<decltype(atomicDec_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
402  : std::true_type
403  {
404  static __device__ T atomic(T* add, T value)
405  {
406  return atomicDec_block(add, value);
407  }
408  };
409 # endif
410 
411  // And.
412 
413  template<typename T, typename THierarchy>
415  alpaka::AtomicAnd,
416  T,
417  THierarchy,
418  typename std::void_t<decltype(atomicAnd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
419  : std::true_type
420  {
421  static __device__ T atomic(T* add, T value)
422  {
423  return atomicAnd(add, value);
424  }
425  };
426 
427 # if !CLANG_CUDA_PTX_WORKAROUND
428  template<typename T>
430  alpaka::AtomicAnd,
431  T,
433  typename std::void_t<decltype(atomicAnd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
434  : std::true_type
435  {
436  static __device__ T atomic(T* add, T value)
437  {
438  return atomicAnd_block(add, value);
439  }
440  };
441 # endif
442 
443  // Or.
444 
445  template<typename T, typename THierarchy>
447  alpaka::AtomicOr,
448  T,
449  THierarchy,
450  typename std::void_t<decltype(atomicOr(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
451  : std::true_type
452  {
453  static __device__ T atomic(T* add, T value)
454  {
455  return atomicOr(add, value);
456  }
457  };
458 
459 # if !CLANG_CUDA_PTX_WORKAROUND
460  template<typename T>
462  alpaka::AtomicOr,
463  T,
465  typename std::void_t<decltype(atomicOr_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
466  : std::true_type
467  {
468  static __device__ T atomic(T* add, T value)
469  {
470  return atomicOr_block(add, value);
471  }
472  };
473 # endif
474 
475  // Xor.
476 
477  template<typename T, typename THierarchy>
479  alpaka::AtomicXor,
480  T,
481  THierarchy,
482  typename std::void_t<decltype(atomicXor(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
483  : std::true_type
484  {
485  static __device__ T atomic(T* add, T value)
486  {
487  return atomicXor(add, value);
488  }
489  };
490 
491 # if !CLANG_CUDA_PTX_WORKAROUND
492  template<typename T>
494  alpaka::AtomicXor,
495  T,
497  typename std::void_t<decltype(atomicXor_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
498  : std::true_type
499  {
500  static __device__ T atomic(T* add, T value)
501  {
502  return atomicXor_block(add, value);
503  }
504  };
505 # endif
506 
507 } // namespace alpakaGlobal
508 
509 # undef CLANG_CUDA_PTX_WORKAROUND
510 # endif
511 
512 #endif
The GPU CUDA/HIP accelerator atomic ops.
These types must be in the global namespace for checking existence of respective functions in global ...
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicXor(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic xor operation.
Definition: Traits.hpp:276
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicExch(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic exchange operation.
Definition: Traits.hpp:186
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicSub(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic sub operation.
Definition: Traits.hpp:132
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicMin(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic min operation.
Definition: Traits.hpp:150
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicInc(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic increment operation.
Definition: Traits.hpp:204
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicDec(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic decrement operation.
Definition: Traits.hpp:222
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicMax(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic max operation.
Definition: Traits.hpp:168
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAdd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic add operation.
Definition: Traits.hpp:114
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicOr(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic or operation.
Definition: Traits.hpp:258
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAnd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic and operation.
Definition: Traits.hpp:240
Provide an interface to builtin atomic functions.