alpaka
Abstraction Library for Parallel Kernel Acceleration
AtomicUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
8 #include "alpaka/atomic/Op.hpp"
11 #include "alpaka/core/Decay.hpp"
13 
14 #include <limits>
15 #include <type_traits>
16 
17 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
18 
19 # if !defined(ALPAKA_HOST_ONLY)
20 
21 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
22 # error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
23 # endif
24 
25 # if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
26 # error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
27 # endif
28 
29 namespace alpaka::trait
30 {
31  namespace detail
32  {
34  {
35  //! reinterprets an address as an 32bit value for atomicCas emulation usage
36  template<typename TAddressType>
37  static __device__ auto reinterpretAddress(TAddressType* address)
38  -> std::enable_if_t<sizeof(TAddressType) == 4u, unsigned int*>
39  {
40  return reinterpret_cast<unsigned int*>(address);
41  }
42 
43  //! reinterprets a address as an 64bit value for atomicCas emulation usage
44  template<typename TAddressType>
45  static __device__ auto reinterpretAddress(TAddressType* address)
46  -> std::enable_if_t<sizeof(TAddressType) == 8u, unsigned long long int*>
47  {
48  return reinterpret_cast<unsigned long long int*>(address);
49  }
50 
51  //! reinterprets a value to be usable for the atomicCAS emulation
52  template<typename T_Type>
53  static __device__ auto reinterpretValue(T_Type value)
54  {
55  return *reinterpretAddress(&value);
56  }
57  };
58 
59  //! Emulate atomic
60  //
61  // The default implementation will emulate all atomic functions with atomicCAS.
62  template<
63  typename TOp,
64  typename TAtomic,
65  typename T,
66  typename THierarchy,
67  typename TSfinae = void,
68  typename TDefer = void>
69  struct EmulateAtomic : private EmulationBase
70  {
71  public:
72  static __device__ auto atomic(
74  T* const addr,
75  T const& value) -> T
76  {
77  auto* const addressAsIntegralType = reinterpretAddress(addr);
78  using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
79 
80  // Emulating atomics with atomicCAS is mentioned in the programming guide too.
81  // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
82 # if BOOST_LANG_HIP
83 # if __has_builtin(__hip_atomic_load)
84  EmulatedType old{__hip_atomic_load(addressAsIntegralType, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)};
85 # else
86  EmulatedType old{__atomic_load_n(addressAsIntegralType, __ATOMIC_RELAXED)};
87 # endif
88 # else
89  EmulatedType old{*addressAsIntegralType};
90 # endif
91  EmulatedType assumed;
92  do
93  {
94  assumed = old;
95  T v = *(reinterpret_cast<T*>(&assumed));
96  TOp{}(&v, value);
97  using Cas = alpaka::trait::
98  AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>;
99  old = Cas::atomicOp(ctx, addressAsIntegralType, assumed, reinterpretValue(v));
100  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
101  } while(assumed != old);
102  return *(reinterpret_cast<T*>(&old));
103  }
104  };
105 
106  //! Emulate AtomicCas with equivalent unisigned integral type
107  template<typename T, typename THierarchy>
109  : private EmulationBase
110  {
111  static __device__ auto atomic(
113  T* const addr,
114  T const& compare,
115  T const& value) -> T
116  {
117  auto* const addressAsIntegralType = reinterpretAddress(addr);
118  using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
119  EmulatedType reinterpretedCompare = reinterpretValue(compare);
120  EmulatedType reinterpretedValue = reinterpretValue(value);
121 
122  auto old = alpaka::trait::
124  atomicOp(ctx, addressAsIntegralType, reinterpretedCompare, reinterpretedValue);
125 
126  return *(reinterpret_cast<T*>(&old));
127  }
128  };
129 
130  //! Emulate AtomicSub with atomicAdd
131  template<typename T, typename THierarchy>
133  {
134  static __device__ auto atomic(
136  T* const addr,
137  T const& value) -> T
138  {
140  atomicOp(ctx, addr, -value);
141  }
142  };
143 
144  //! AtomicDec can not be implemented for floating point types!
145  template<typename T, typename THierarchy>
147  alpaka::AtomicDec,
149  T,
150  THierarchy,
151  std::enable_if_t<std::is_floating_point_v<T>>>
152  {
153  static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
154  {
155  static_assert(
156  !sizeof(T),
157  "EmulateAtomic<alpaka::AtomicDec> is not supported for floating point data types!");
158  return T{};
159  }
160  };
161 
162  //! AtomicInc can not be implemented for floating point types!
163  template<typename T, typename THierarchy>
165  alpaka::AtomicInc,
167  T,
168  THierarchy,
169  std::enable_if_t<std::is_floating_point_v<T>>>
170  {
171  static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
172  {
173  static_assert(
174  !sizeof(T),
175  "EmulateAtomic<alpaka::AtomicInc> is not supported for floating point data types!");
176  return T{};
177  }
178  };
179 
180  //! AtomicAnd can not be implemented for floating point types!
181  template<typename T, typename THierarchy>
183  alpaka::AtomicAnd,
185  T,
186  THierarchy,
187  std::enable_if_t<std::is_floating_point_v<T>>>
188  {
189  static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
190  {
191  static_assert(
192  !sizeof(T),
193  "EmulateAtomic<alpaka::AtomicAnd> is not supported for floating point data types!");
194  return T{};
195  }
196  };
197 
198  //! AtomicOr can not be implemented for floating point types!
199  template<typename T, typename THierarchy>
201  alpaka::AtomicOr,
203  T,
204  THierarchy,
205  std::enable_if_t<std::is_floating_point_v<T>>>
206  {
207  static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
208  {
209  static_assert(
210  !sizeof(T),
211  "EmulateAtomic<alpaka::AtomicOr> is not supported for floating point data types!");
212  return T{};
213  }
214  };
215 
216  //! AtomicXor can not be implemented for floating point types!
217  template<typename T, typename THierarchy>
219  alpaka::AtomicXor,
221  T,
222  THierarchy,
223  std::enable_if_t<std::is_floating_point_v<T>>>
224  {
225  static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
226  {
227  static_assert(
228  !sizeof(T),
229  "EmulateAtomic<alpaka::AtomicXor> is not supported for floating point data types!");
230  return T{};
231  }
232  };
233 
234  } // namespace detail
235 
236  //! Generic atomic implementation
237  //
238  // - unsigned long int will be redirected to unsigned long long int or unsigned int implementation depending if
239  // unsigned long int is a 64 or 32bit data type.
240  // - Atomics which are not available as builtin atomic will be emulated.
241  template<typename TOp, typename T, typename THierarchy>
242  struct AtomicOp<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>
243  {
244  static __device__ auto atomicOp(
245  AtomicUniformCudaHipBuiltIn const& ctx,
246  [[maybe_unused]] T* const addr,
247  [[maybe_unused]] T const& value) -> T
248  {
249  static_assert(
250  sizeof(T) == 4u || sizeof(T) == 8u,
251  "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, value) is not supported! Only 64 and "
252  "32bit atomics are supported.");
253 
255  return ::AlpakaBuiltInAtomic<TOp, T, THierarchy>::atomic(addr, value);
256 
257  else if constexpr(std::is_same_v<unsigned long int, T>)
258  {
259  if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::value)
260  return ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::atomic(
261  reinterpret_cast<unsigned int*>(addr),
262  static_cast<unsigned int>(value));
263  else if constexpr(
265  {
266  return ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::atomic(
267  reinterpret_cast<unsigned long long int*>(addr),
268  static_cast<unsigned long long int>(value));
269  }
270  }
271 
273  }
274  };
275 
276  template<typename T, typename THierarchy>
278  {
279  static __device__ auto atomicOp(
280  [[maybe_unused]] AtomicUniformCudaHipBuiltIn const& ctx,
281  [[maybe_unused]] T* const addr,
282  [[maybe_unused]] T const& compare,
283  [[maybe_unused]] T const& value) -> T
284  {
285  static_assert(
286  sizeof(T) == 4u || sizeof(T) == 8u,
287  "atomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, compare, value) is not "
288  "supported! Only 64 and "
289  "32bit atomics are supported.");
290 
292  return ::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::atomic(addr, compare, value);
293 
294  else if constexpr(std::is_same_v<unsigned long int, T>)
295  {
296  if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::value)
297  return ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::atomic(
298  reinterpret_cast<unsigned int*>(addr),
299  static_cast<unsigned int>(compare),
300  static_cast<unsigned int>(value));
301  else if constexpr(
302  sizeof(T) == 8u
304  {
305  return ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::atomic(
306  reinterpret_cast<unsigned long long int*>(addr),
307  static_cast<unsigned long long int>(compare),
308  static_cast<unsigned long long int>(value));
309  }
310  }
311 
313  ctx,
314  addr,
315  compare,
316  value);
317  }
318  };
319 } // namespace alpaka::trait
320 # endif
321 #endif
The GPU CUDA/HIP accelerator atomic ops.
The accelerator traits.
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicOp(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &=THierarchy()) -> T
Executes the given operation atomically.
Definition: Traits.hpp:73
Provide an interface to builtin atomic functions.
The and function object.
Definition: Op.hpp:140
The compare and swap function object.
Definition: Op.hpp:185
The decrement function object.
Definition: Op.hpp:123
The increment function object.
Definition: Op.hpp:106
The or function object.
Definition: Op.hpp:155
The subtraction function object.
Definition: Op.hpp:39
The exclusive or function object.
Definition: Op.hpp:170
static __device__ auto atomicOp([[maybe_unused]] AtomicUniformCudaHipBuiltIn const &ctx, [[maybe_unused]] T *const addr, [[maybe_unused]] T const &compare, [[maybe_unused]] T const &value) -> T
static __device__ auto atomicOp(AtomicUniformCudaHipBuiltIn const &ctx, [[maybe_unused]] T *const addr, [[maybe_unused]] T const &value) -> T
The atomic operation trait.
Definition: Traits.hpp:60
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &compare, T const &value) -> T
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
static __device__ auto reinterpretValue(T_Type value)
reinterprets a value to be usable for the atomicCAS emulation
static __device__ auto reinterpretAddress(TAddressType *address) -> std::enable_if_t< sizeof(TAddressType)==8u, unsigned long long int * >
reinterprets an address as a 64-bit value for atomicCas emulation usage
static __device__ auto reinterpretAddress(TAddressType *address) -> std::enable_if_t< sizeof(TAddressType)==4u, unsigned int * >
reinterprets an address as a 32-bit value for atomicCas emulation usage