alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
AtomicUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/core/Decay.hpp"
13
14#include <limits>
15#include <type_traits>
16
17#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
18
19# if !defined(ALPAKA_HOST_ONLY)
20
21# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
22# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
23# endif
24
25# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
26# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
27# endif
28
29namespace alpaka::trait
30{
31 namespace detail
32 {
34 {
35 //! reinterprets an address as an 32bit value for atomicCas emulation usage
36 template<typename TAddressType>
37 static __device__ auto reinterpretAddress(TAddressType* address)
38 -> std::enable_if_t<sizeof(TAddressType) == 4u, unsigned int*>
39 {
40 return reinterpret_cast<unsigned int*>(address);
41 }
42
43 //! reinterprets a address as an 64bit value for atomicCas emulation usage
44 template<typename TAddressType>
45 static __device__ auto reinterpretAddress(TAddressType* address)
46 -> std::enable_if_t<sizeof(TAddressType) == 8u, unsigned long long int*>
47 {
48 return reinterpret_cast<unsigned long long int*>(address);
49 }
50
51 //! reinterprets a value to be usable for the atomicCAS emulation
52 template<typename T_Type>
53 static __device__ auto reinterpretValue(T_Type value)
54 {
55 return *reinterpretAddress(&value);
56 }
57 };
58
59 //! Emulate atomic
60 //
61 // The default implementation will emulate all atomic functions with atomicCAS.
62 template<
63 typename TOp,
64 typename TAtomic,
65 typename T,
66 typename THierarchy,
67 typename TSfinae = void,
68 typename TDefer = void>
70 {
71 public:
72 static __device__ auto atomic(
74 T* const addr,
75 T const& value) -> T
76 {
77 auto* const addressAsIntegralType = reinterpretAddress(addr);
78 using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
79
80 // Emulating atomics with atomicCAS is mentioned in the programming guide too.
81 // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
82# if BOOST_LANG_HIP
83# if __has_builtin(__hip_atomic_load)
84 EmulatedType old{__hip_atomic_load(addressAsIntegralType, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)};
85# else
86 EmulatedType old{__atomic_load_n(addressAsIntegralType, __ATOMIC_RELAXED)};
87# endif
88# else
89 EmulatedType old{*addressAsIntegralType};
90# endif
91 EmulatedType assumed;
92 do
93 {
94 assumed = old;
95 T v = *(reinterpret_cast<T*>(&assumed));
96 TOp{}(&v, value);
97 using Cas = alpaka::trait::
98 AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>;
99 old = Cas::atomicOp(ctx, addressAsIntegralType, assumed, reinterpretValue(v));
100 // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
101 } while(assumed != old);
102 return *(reinterpret_cast<T*>(&old));
103 }
104 };
105
106 //! Emulate AtomicCas with equivalent unisigned integral type
107 template<typename T, typename THierarchy>
109 : private EmulationBase
110 {
111 static __device__ auto atomic(
113 T* const addr,
114 T const& compare,
115 T const& value) -> T
116 {
117 auto* const addressAsIntegralType = reinterpretAddress(addr);
118 using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
119 EmulatedType reinterpretedCompare = reinterpretValue(compare);
120 EmulatedType reinterpretedValue = reinterpretValue(value);
121
122 auto old = alpaka::trait::
123 AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>::
124 atomicOp(ctx, addressAsIntegralType, reinterpretedCompare, reinterpretedValue);
125
126 return *(reinterpret_cast<T*>(&old));
127 }
128 };
129
130 //! Emulate AtomicSub with atomicAdd
131 template<typename T, typename THierarchy>
133 {
134 static __device__ auto atomic(
136 T* const addr,
137 T const& value) -> T
138 {
140 atomicOp(ctx, addr, -value);
141 }
142 };
143
144 //! AtomicDec can not be implemented for floating point types!
145 template<typename T, typename THierarchy>
149 T,
150 THierarchy,
151 std::enable_if_t<std::is_floating_point_v<T>>>
152 {
153 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
154 {
155 static_assert(
156 !sizeof(T),
157 "EmulateAtomic<alpaka::AtomicDec> is not supported for floating point data types!");
158 return T{};
159 }
160 };
161
162 //! AtomicInc can not be implemented for floating point types!
163 template<typename T, typename THierarchy>
167 T,
168 THierarchy,
169 std::enable_if_t<std::is_floating_point_v<T>>>
170 {
171 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
172 {
173 static_assert(
174 !sizeof(T),
175 "EmulateAtomic<alpaka::AtomicInc> is not supported for floating point data types!");
176 return T{};
177 }
178 };
179
180 //! AtomicAnd can not be implemented for floating point types!
181 template<typename T, typename THierarchy>
185 T,
186 THierarchy,
187 std::enable_if_t<std::is_floating_point_v<T>>>
188 {
189 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
190 {
191 static_assert(
192 !sizeof(T),
193 "EmulateAtomic<alpaka::AtomicAnd> is not supported for floating point data types!");
194 return T{};
195 }
196 };
197
198 //! AtomicOr can not be implemented for floating point types!
199 template<typename T, typename THierarchy>
203 T,
204 THierarchy,
205 std::enable_if_t<std::is_floating_point_v<T>>>
206 {
207 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
208 {
209 static_assert(
210 !sizeof(T),
211 "EmulateAtomic<alpaka::AtomicOr> is not supported for floating point data types!");
212 return T{};
213 }
214 };
215
216 //! AtomicXor can not be implemented for floating point types!
217 template<typename T, typename THierarchy>
221 T,
222 THierarchy,
223 std::enable_if_t<std::is_floating_point_v<T>>>
224 {
225 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
226 {
227 static_assert(
228 !sizeof(T),
229 "EmulateAtomic<alpaka::AtomicXor> is not supported for floating point data types!");
230 return T{};
231 }
232 };
233
234 } // namespace detail
235
236 //! Generic atomic implementation
237 //
238 // - unsigned long int will be redirected to unsigned long long int or unsigned int implementation depending if
239 // unsigned long int is a 64 or 32bit data type.
240 // - Atomics which are not available as builtin atomic will be emulated.
241 template<typename TOp, typename T, typename THierarchy>
242 struct AtomicOp<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>
243 {
244 static __device__ auto atomicOp(
246 [[maybe_unused]] T* const addr,
247 [[maybe_unused]] T const& value) -> T
248 {
249 static_assert(
250 sizeof(T) == 4u || sizeof(T) == 8u,
251 "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, value) is not supported! Only 64 and "
252 "32bit atomics are supported.");
253
254 if constexpr(::AlpakaBuiltInAtomic<TOp, T, THierarchy>::value)
255 return ::AlpakaBuiltInAtomic<TOp, T, THierarchy>::atomic(addr, value);
256
257 else if constexpr(std::is_same_v<unsigned long int, T>)
258 {
259 if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::value)
260 return ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::atomic(
261 reinterpret_cast<unsigned int*>(addr),
262 static_cast<unsigned int>(value));
263 else if constexpr(
264 sizeof(T) == 8u && ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::value) // LP64
265 {
266 return ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::atomic(
267 reinterpret_cast<unsigned long long int*>(addr),
268 static_cast<unsigned long long int>(value));
269 }
270 }
271
273 }
274 };
275
276 template<typename T, typename THierarchy>
278 {
279 static __device__ auto atomicOp(
280 [[maybe_unused]] AtomicUniformCudaHipBuiltIn const& ctx,
281 [[maybe_unused]] T* const addr,
282 [[maybe_unused]] T const& compare,
283 [[maybe_unused]] T const& value) -> T
284 {
285 static_assert(
286 sizeof(T) == 4u || sizeof(T) == 8u,
287 "atomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, compare, value) is not "
288 "supported! Only 64 and "
289 "32bit atomics are supported.");
290
291 if constexpr(::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::value)
292 return ::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::atomic(addr, compare, value);
293
294 else if constexpr(std::is_same_v<unsigned long int, T>)
295 {
296 if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::value)
297 return ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::atomic(
298 reinterpret_cast<unsigned int*>(addr),
299 static_cast<unsigned int>(compare),
300 static_cast<unsigned int>(value));
301 else if constexpr(
302 sizeof(T) == 8u
303 && ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::value) // LP64
304 {
305 return ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::atomic(
306 reinterpret_cast<unsigned long long int*>(addr),
307 static_cast<unsigned long long int>(compare),
308 static_cast<unsigned long long int>(value));
309 }
310 }
311
313 ctx,
314 addr,
315 compare,
316 value);
317 }
318 };
319} // namespace alpaka::trait
320# endif
321#endif
The GPU CUDA/HIP accelerator atomic ops.
The accelerator traits.
The alpaka accelerator library.
The and function object.
Definition Op.hpp:140
The compare and swap function object.
Definition Op.hpp:185
The decrement function object.
Definition Op.hpp:123
The increment function object.
Definition Op.hpp:106
The or function object.
Definition Op.hpp:155
The subtraction function object.
Definition Op.hpp:39
The exclusive or function object.
Definition Op.hpp:170
static __device__ auto atomicOp(AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &compare, T const &value) -> T
static __device__ auto atomicOp(AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
The atomic operation trait.
Definition Traits.hpp:60
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &compare, T const &value) -> T
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
static __device__ auto reinterpretValue(T_Type value)
reinterprets a value to be usable for the atomicCAS emulation
static __device__ auto reinterpretAddress(TAddressType *address) -> std::enable_if_t< sizeof(TAddressType)==8u, unsigned long long int * >
reinterprets a address as an 64bit value for atomicCas emulation usage
static __device__ auto reinterpretAddress(TAddressType *address) -> std::enable_if_t< sizeof(TAddressType)==4u, unsigned int * >
reinterprets an address as an 32bit value for atomicCas emulation usage