alpaka
Abstraction Library for Parallel Kernel Acceleration
WarpUniformCudaHipBuiltIn.hpp
/* Copyright 2023 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

// Note: the original include block is elided in this rendering; the two project headers below are
// inferred from the symbols used in this file (interface::Implements, ConceptWarp and the warp traits)
// and may differ from the exact originals.
#include "alpaka/core/Interface.hpp"
#include "alpaka/warp/Traits.hpp"

#include <cstdint>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka::warp
{
    //! The GPU CUDA/HIP warp.
    class WarpUniformCudaHipBuiltIn : public interface::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
    {
    };

#    if !defined(ALPAKA_HOST_ONLY)

#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif

    namespace trait
    {
        template<>
        struct GetSize<WarpUniformCudaHipBuiltIn>
        {
            static __device__ auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
            {
                return warpSize;
            }
        };

        template<>
        struct GetSizeCompileTime<WarpUniformCudaHipBuiltIn>
        {
            static constexpr __device__ auto getSizeCompileTime() -> std::int32_t
            {
#        if defined(__CUDA_ARCH__)
                // CUDA always has a warp size of 32
                return 32;
#        elif defined(__HIP_DEVICE_COMPILE__)
                // HIP/ROCm may have a wavefront size of 32 or 64 depending on the target device
#            if defined(__GFX9__)
                // GCN 5.0 and CDNA GPUs have a wavefront size of 64
                return 64;
#            elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
                // RDNA GPUs have a wavefront size of 32
                return 32;
#            else
                // Unknown AMD GPU architecture
#                ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE;
#                else
#                    error The current AMD GPU architecture is not supported by this version of alpaka. You can define a default wavefront size by setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return 0;
#                endif
#            endif
#        endif
                // Host compilation
                return 0;
            }
        };

        template<>
        struct GetSizeUpperLimit<WarpUniformCudaHipBuiltIn>
        {
            static constexpr __device__ auto getSizeUpperLimit() -> std::int32_t
            {
#        if defined(__CUDA_ARCH__)
                // CUDA always has a warp size of 32
                return 32;
#        elif defined(__HIP_DEVICE_COMPILE__)
                // HIP/ROCm may have a wavefront size of 32 or 64 depending on the target device
#            if defined(__GFX9__)
                // GCN 5.0 and CDNA GPUs have a wavefront size of 64
                return 64;
#            elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
                // RDNA GPUs have a wavefront size of 32
                return 32;
#            else
                // Unknown AMD GPU architecture
#                ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE;
#                else
#                    error The current AMD GPU architecture is not supported by this version of alpaka. You can define a default wavefront size by setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return 64;
#                endif
#            endif
#        endif
                // Host compilation
                return 64;
            }
        };

        template<>
        struct Activemask<WarpUniformCudaHipBuiltIn>
        {
            static __device__ auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                -> std::uint32_t
#        else
                -> std::uint64_t
#        endif
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __activemask();
#        else
                // No HIP intrinsic for it, emulate via ballot
                return __ballot(1);
#        endif
            }
        };

        template<>
        struct All<WarpUniformCudaHipBuiltIn>
        {
            static __device__ auto all(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> std::int32_t
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __all_sync(activemask(warp), predicate);
#        else
                return __all(predicate);
#        endif
            }
        };

        template<>
        struct Any<WarpUniformCudaHipBuiltIn>
        {
            static __device__ auto any(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> std::int32_t
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __any_sync(activemask(warp), predicate);
#        else
                return __any(predicate);
#        endif
            }
        };

        template<>
        struct Ballot<WarpUniformCudaHipBuiltIn>
        {
            static __device__ auto ballot(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate)
            // return type is required by the compiler
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                -> std::uint32_t
#        else
                -> std::uint64_t
#        endif
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __ballot_sync(activemask(warp), predicate);
#        else
                return __ballot(predicate);
#        endif
            }
        };

        template<>
        struct Shfl<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            static __device__ auto shfl(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                int srcLane,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __shfl_sync(activemask(warp), val, srcLane, width);
#        else
                return __shfl(val, srcLane, width);
#        endif
            }
        };

        template<>
        struct ShflUp<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            static __device__ auto shfl_up(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::uint32_t offset,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __shfl_up_sync(activemask(warp), val, offset, width);
#        else
                return __shfl_up(val, offset, width);
#        endif
            }
        };

        template<>
        struct ShflDown<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            static __device__ auto shfl_down(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::uint32_t offset,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __shfl_down_sync(activemask(warp), val, offset, width);
#        else
                return __shfl_down(val, offset, width);
#        endif
            }
        };

        template<>
        struct ShflXor<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            static __device__ auto shfl_xor(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::int32_t mask,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
    || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                return __shfl_xor_sync(activemask(warp), val, mask, width);
#        else
                return __shfl_xor(val, mask, width);
#        endif
            }
        };

    } // namespace trait
#    endif
} // namespace alpaka::warp

#endif
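
Usage sketch (not part of the header above): how the size traits are reached from kernel code through the free functions declared in alpaka/warp/Traits.hpp. The kernel functor, its output parameter, and the template-argument spelling of the compile-time queries are assumptions made for illustration; only getSize(warp) and the trait behaviour are taken from the listing above. Host-side launch boilerplate is omitted.

#include <alpaka/alpaka.hpp> // umbrella header, pulls in the warp traits

#include <cstdint>

struct WarpSizeKernel // hypothetical example kernel, not part of alpaka
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t* sizeOut) const
    {
        // Runtime warp size: dispatches to trait::GetSize, i.e. the built-in warpSize
        // on the CUDA/HIP back-ends.
        std::int32_t const runtimeSize = alpaka::warp::getSize(acc);

        // Compile-time queries (template-argument form assumed here): 0 when the warp
        // size is not a compile-time constant, plus an upper bound that always holds.
        constexpr std::int32_t sizeCt = alpaka::warp::getSizeCompileTime<TAcc>();
        constexpr std::int32_t sizeUpper = alpaka::warp::getSizeUpperLimit<TAcc>();
        static_assert(sizeCt == 0 || sizeCt <= sizeUpper, "compile-time size must not exceed the upper limit");

        // Let the first thread report the runtime value.
        if(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0] == 0)
            *sizeOut = runtimeSize;
    }
};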
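
Another sketch, for the mask-returning traits: activemask and ballot yield a 32-bit mask on the CUDA back-end and a 64-bit mask on the HIP back-end (see the return-type switches above), so the result is best held in auto. While the lanes are still converged, comparing the two masks reports the same condition as the all vote. The kernel functor and predicate buffer are assumptions for illustration.

#include <alpaka/alpaka.hpp>

#include <cstdint>

struct WarpBallotKernel // hypothetical example kernel, not part of alpaka
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t const* flags, std::int32_t* allSet) const
    {
        auto const gtid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

        // std::uint32_t on CUDA, std::uint64_t on HIP; the Nth bit corresponds to lane N.
        auto const active = alpaka::warp::activemask(acc);
        auto const voted = alpaka::warp::ballot(acc, flags[gtid]);

        // With all lanes converged at this point, "every active lane voted yes" is the
        // same condition that alpaka::warp::all(acc, flags[gtid]) reports.
        if(gtid == 0)
            *allSet = (voted == active) ? 1 : 0;
    }
};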
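
A final sketch for the shuffle traits: a per-warp sum reduction via shfl_down followed by an all vote. On the CUDA/HIP back-ends these calls end up in the __shfl_down_sync/__all_sync branches above (or the legacy __shfl_down/__all ones on older HIP). The kernel functor, buffer layout, and the assumption that warp lanes map to consecutive global thread indices are illustrative only; the warp function signatures follow alpaka/warp/Traits.hpp.

#include <alpaka/alpaka.hpp>

#include <cstdint>

struct WarpReduceKernel // hypothetical example kernel, not part of alpaka
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out) const
    {
        using Idx = alpaka::Idx<TAcc>;
        Idx const gtid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        Idx const warpExtent = static_cast<Idx>(alpaka::warp::getSize(acc));

        float value = in[gtid];

        // Tree reduction inside the warp: each step adds the value held by the lane
        // `offset` positions higher (trait::ShflDown above).
        for(Idx offset = warpExtent / 2; offset > 0; offset /= 2)
            value += alpaka::warp::shfl_down(acc, value, static_cast<std::uint32_t>(offset));

        // Warp vote: non-zero iff the predicate is non-zero on every active lane
        // (trait::All above). Here it merely gates the write-back.
        if(alpaka::warp::all(acc, value >= 0.0f ? 1 : 0))
        {
            // Lane 0 of each warp writes that warp's sum.
            if(gtid % warpExtent == 0)
                out[gtid / warpExtent] = value;
        }
    }
};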