alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
AtomicUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/core/Decay.hpp"
13
14#include <bit>
15#include <limits>
16#include <type_traits>
17
18#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
19
20# if !defined(ALPAKA_HOST_ONLY)
21
22# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
23# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
24# endif
25
26# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
27# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
28# endif
29
30namespace alpaka::trait
31{
32 namespace detail
33 {
35 {
36 template<typename T>
37 using AtomicCasType = std::conditional_t<
38 sizeof(T) == 4u,
39 unsigned int,
40 std::conditional_t<sizeof(T) == 8u, unsigned long long int, void>>;
41
42 template<typename T>
43 static __device__ auto reinterpretAddress(T* address)
44 -> AtomicCasType<T>* requires(sizeof(T) == 4u || sizeof(T) == 8u) {
45 return reinterpret_cast<AtomicCasType<T>*>(address);
46 }
47
48 template<typename T>
49 static __device__ auto reinterpretValue(T value)
50 -> AtomicCasType<T> requires(sizeof(T) == 4u || sizeof(T) == 8u)
51 {
52 return std::bit_cast<AtomicCasType<T>>(value);
53 }
54 };
55
56 //! Emulate atomic
57 //
58 // The default implementation will emulate all atomic functions with atomicCAS.
59 template<
60 typename TOp,
61 typename TAtomic,
62 typename T,
63 typename THierarchy,
64 typename TSfinae = void,
65 typename TDefer = void>
67 {
68 public:
69 static __device__ auto atomic(
71 T* const addr,
72 T const& value) -> T
73 {
74 auto* const addressAsIntegralType = reinterpretAddress(addr);
75 using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
76
77// Emulating atomics with atomicCAS is mentioned in the programming guide too.
78// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
79# if ALPAKA_LANG_HIP
80# if __has_builtin(__hip_atomic_load)
81 EmulatedType old{__hip_atomic_load(addressAsIntegralType, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)};
82# else
83 EmulatedType old{__atomic_load_n(addressAsIntegralType, __ATOMIC_RELAXED)};
84# endif
85# else
86 EmulatedType old{*addressAsIntegralType};
87# endif
88 EmulatedType assumed;
89 do
90 {
91 assumed = old;
92 T v = std::bit_cast<T>(assumed);
93 TOp{}(&v, value);
94 using Cas = alpaka::trait::
95 AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>;
96 old = Cas::atomicOp(ctx, addressAsIntegralType, assumed, reinterpretValue(v));
97 // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
98 } while(assumed != old);
99 return std::bit_cast<T>(old);
100 }
101 };
102
103 //! Emulate AtomicCas with equivalent unisigned integral type
104 template<typename T, typename THierarchy>
106 : private EmulationBase
107 {
108 static __device__ auto atomic(
110 T* const addr,
111 T const& compare,
112 T const& value) -> T
113 {
114 auto* const addressAsIntegralType = reinterpretAddress(addr);
115 using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
116 EmulatedType reinterpretedCompare = reinterpretValue(compare);
117 EmulatedType reinterpretedValue = reinterpretValue(value);
118
119 auto old = alpaka::trait::
120 AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>::
121 atomicOp(ctx, addressAsIntegralType, reinterpretedCompare, reinterpretedValue);
122
123 return std::bit_cast<T>(old);
124 }
125 };
126
127 //! Emulate AtomicSub with atomicAdd
128 template<typename T, typename THierarchy>
130 {
131 static __device__ auto atomic(
133 T* const addr,
134 T const& value) -> T
135 {
137 atomicOp(ctx, addr, -value);
138 }
139 };
140
141 //! AtomicDec can not be implemented for floating point types!
142 template<typename T, typename THierarchy>
146 T,
147 THierarchy,
148 std::enable_if_t<std::is_floating_point_v<T>>>
149 {
150 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
151 {
152 static_assert(
153 !sizeof(T),
154 "EmulateAtomic<alpaka::AtomicDec> is not supported for floating point data types!");
155 return T{};
156 }
157 };
158
159 //! AtomicInc can not be implemented for floating point types!
160 template<typename T, typename THierarchy>
164 T,
165 THierarchy,
166 std::enable_if_t<std::is_floating_point_v<T>>>
167 {
168 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
169 {
170 static_assert(
171 !sizeof(T),
172 "EmulateAtomic<alpaka::AtomicInc> is not supported for floating point data types!");
173 return T{};
174 }
175 };
176
177 //! AtomicAnd can not be implemented for floating point types!
178 template<typename T, typename THierarchy>
182 T,
183 THierarchy,
184 std::enable_if_t<std::is_floating_point_v<T>>>
185 {
186 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
187 {
188 static_assert(
189 !sizeof(T),
190 "EmulateAtomic<alpaka::AtomicAnd> is not supported for floating point data types!");
191 return T{};
192 }
193 };
194
195 //! AtomicOr can not be implemented for floating point types!
196 template<typename T, typename THierarchy>
200 T,
201 THierarchy,
202 std::enable_if_t<std::is_floating_point_v<T>>>
203 {
204 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
205 {
206 static_assert(
207 !sizeof(T),
208 "EmulateAtomic<alpaka::AtomicOr> is not supported for floating point data types!");
209 return T{};
210 }
211 };
212
213 //! AtomicXor can not be implemented for floating point types!
214 template<typename T, typename THierarchy>
218 T,
219 THierarchy,
220 std::enable_if_t<std::is_floating_point_v<T>>>
221 {
222 static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
223 {
224 static_assert(
225 !sizeof(T),
226 "EmulateAtomic<alpaka::AtomicXor> is not supported for floating point data types!");
227 return T{};
228 }
229 };
230
231 } // namespace detail
232
233 //! Generic atomic implementation
234 //
235 // - unsigned long int will be redirected to unsigned long long int or unsigned int implementation depending if
236 // unsigned long int is a 64 or 32bit data type.
237 // - Atomics which are not available as builtin atomic will be emulated.
238 template<typename TOp, typename T, typename THierarchy>
239 struct AtomicOp<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>
240 {
241 static __device__ auto atomicOp(
243 [[maybe_unused]] T* const addr,
244 [[maybe_unused]] T const& value) -> T
245 {
246 static_assert(
247 sizeof(T) == 4u || sizeof(T) == 8u,
248 "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, value) is not supported! Only 64 and "
249 "32bit atomics are supported.");
250
251 if constexpr(::AlpakaBuiltInAtomic<TOp, T, THierarchy>::value)
252 return ::AlpakaBuiltInAtomic<TOp, T, THierarchy>::atomic(addr, value);
253
254 else if constexpr(std::is_same_v<unsigned long int, T>)
255 {
256 if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::value)
257 return ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::atomic(
258 reinterpret_cast<unsigned int*>(addr),
259 static_cast<unsigned int>(value));
260 else if constexpr(
261 sizeof(T) == 8u && ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::value) // LP64
262 {
263 return ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::atomic(
264 reinterpret_cast<unsigned long long int*>(addr),
265 static_cast<unsigned long long int>(value));
266 }
267 }
268
270 }
271 };
272
273 template<typename T, typename THierarchy>
275 {
276 static __device__ auto atomicOp(
277 [[maybe_unused]] AtomicUniformCudaHipBuiltIn const& ctx,
278 [[maybe_unused]] T* const addr,
279 [[maybe_unused]] T const& compare,
280 [[maybe_unused]] T const& value) -> T
281 {
282 static_assert(
283 sizeof(T) == 4u || sizeof(T) == 8u,
284 "atomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, compare, value) is not "
285 "supported! Only 64 and "
286 "32bit atomics are supported.");
287
288 if constexpr(::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::value)
289 return ::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::atomic(addr, compare, value);
290
291 else if constexpr(std::is_same_v<unsigned long int, T>)
292 {
293 if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::value)
294 return ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::atomic(
295 reinterpret_cast<unsigned int*>(addr),
296 static_cast<unsigned int>(compare),
297 static_cast<unsigned int>(value));
298 else if constexpr(
299 sizeof(T) == 8u
300 && ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::value) // LP64
301 {
302 return ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::atomic(
303 reinterpret_cast<unsigned long long int*>(addr),
304 static_cast<unsigned long long int>(compare),
305 static_cast<unsigned long long int>(value));
306 }
307 }
308
310 ctx,
311 addr,
312 compare,
313 value);
314 }
315 };
316} // namespace alpaka::trait
317# endif
318#endif
The GPU CUDA/HIP accelerator atomic ops.
The accelerator traits.
The alpaka accelerator library.
The and function object.
Definition Op.hpp:140
The compare and swap function object.
Definition Op.hpp:185
The decrement function object.
Definition Op.hpp:123
The increment function object.
Definition Op.hpp:106
The or function object.
Definition Op.hpp:155
The subtraction function object.
Definition Op.hpp:39
The exclusive or function object.
Definition Op.hpp:170
static __device__ auto atomicOp(AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &compare, T const &value) -> T
static __device__ auto atomicOp(AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
The atomic operation trait.
Definition Traits.hpp:60
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &compare, T const &value) -> T
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const &ctx, T *const addr, T const &value) -> T
std::conditional_t< sizeof(T)==4u, unsigned int, std::conditional_t< sizeof(T)==8u, unsigned long long int, void > > AtomicCasType
static __device__ auto reinterpretValue(T value) -> AtomicCasType< T > requires(sizeof(T)==4u||sizeof(T)==8u)
static __device__ auto reinterpretAddress(T *address) -> AtomicCasType< T > *requires(sizeof(T)==4u||sizeof(T)==8u)