alpaka
Abstraction Library for Parallel Kernel Acceleration
WarpUniformCudaHipBuiltIn.hpp
/* Copyright 2023 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/Interface.hpp"
#include "alpaka/warp/Traits.hpp"

#include <cstdint>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka::warp
{
    //! The GPU CUDA/HIP warp.
    class WarpUniformCudaHipBuiltIn : public interface::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
    {
    };

#    if !defined(ALPAKA_HOST_ONLY)

#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif

    namespace trait
    {
        template<>
        struct GetSize<WarpUniformCudaHipBuiltIn>
        {
            __device__ static auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
            {
                return warpSize;
            }
        };

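        //! Returns the warp size when it is a compile-time constant for the
        //! architecture being compiled, and 0 (size only known at run time)
        //! otherwise, e.g. during the host pass.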
        template<>
        struct GetSizeCompileTime<WarpUniformCudaHipBuiltIn>
        {
            __device__ static constexpr auto getSizeCompileTime() -> std::int32_t
            {
#        if defined(__CUDA_ARCH__)
                // CUDA always has a warp size of 32
                return 32;
#        elif defined(__HIP_DEVICE_COMPILE__)
                // HIP/ROCm may have a wavefront size of 32 or 64 depending on the target device
#            if defined(__GFX9__)
                // GCN 5.0 and CDNA GPUs have a wavefront size of 64
                return 64;
#            elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
                // RDNA GPUs have a wavefront size of 32
                return 32;
#            else
                // Unknown AMD GPU architecture
#                ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE;
#                else
#                    error The current AMD GPU architecture is not supported by this version of alpaka. You can define a default wavefront size by setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return 0;
#                endif
#            endif
#        endif
                // Host compilation
                return 0;
            }
        };

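        //! Like GetSizeCompileTime, but when the exact size is not a
        //! compile-time constant it returns 64, the largest wavefront size of
        //! any supported device, instead of 0.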
        template<>
        struct GetSizeUpperLimit<WarpUniformCudaHipBuiltIn>
        {
            __device__ static constexpr auto getSizeUpperLimit() -> std::int32_t
            {
#        if defined(__CUDA_ARCH__)
                // CUDA always has a warp size of 32
                return 32;
#        elif defined(__HIP_DEVICE_COMPILE__)
                // HIP/ROCm may have a wavefront size of 32 or 64 depending on the target device
#            if defined(__GFX9__)
                // GCN 5.0 and CDNA GPUs have a wavefront size of 64
                return 64;
#            elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
                // RDNA GPUs have a wavefront size of 32
                return 32;
#            else
                // Unknown AMD GPU architecture
#                ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE;
#                else
#                    error The current AMD GPU architecture is not supported by this version of alpaka. You can define a default wavefront size by setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
                return 64;
#                endif
#            endif
#        endif
                // Host compilation
                return 64;
            }
        };

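        //! The mask of active lanes. A CUDA warp has 32 lanes, so the mask
        //! fits into 32 bits; a HIP wavefront may have 64 lanes and needs 64.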
        template<>
        struct Activemask<WarpUniformCudaHipBuiltIn>
        {
            __device__ static auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                -> std::uint32_t
#        else
                -> std::uint64_t
#        endif
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __activemask();
#        else
                // HIP has no activemask intrinsic, so emulate it via ballot
                return __ballot(1);
#        endif
            }
        };

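        // The voting and ballot traits below map to the CUDA *_sync intrinsics
        // with a full 32-lane member mask; HIP provides the classic, maskless
        // intrinsics, which operate on the whole wavefront.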
        template<>
        struct All<WarpUniformCudaHipBuiltIn>
        {
            __device__ static auto all(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> std::int32_t
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __all_sync(0xffff'ffff, predicate);
#        else
                return __all(predicate);
#        endif
            }
        };

        template<>
        struct Any<WarpUniformCudaHipBuiltIn>
        {
            __device__ static auto any(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> std::int32_t
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __any_sync(0xffff'ffff, predicate);
#        else
                return __any(predicate);
#        endif
            }
        };

        template<>
        struct Ballot<WarpUniformCudaHipBuiltIn>
        {
            __device__ static auto ballot(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate)
            // return type is required by the compiler
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                -> std::uint32_t
#        else
                -> std::uint64_t
#        endif
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __ballot_sync(0xffff'ffff, predicate);
#        else
                return __ballot(predicate);
#        endif
            }
        };

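        // For all shuffle traits, width subdivides the warp into equally sized
        // segments; source lane indices are taken relative to the caller's
        // segment, exactly as in the underlying CUDA/HIP intrinsics.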
        template<>
        struct Shfl<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            __device__ static auto shfl(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                int srcLane,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __shfl_sync(0xffff'ffff, val, srcLane, width);
#        else
                return __shfl(val, srcLane, width);
#        endif
            }
        };

        template<>
        struct ShflUp<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            __device__ static auto shfl_up(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::uint32_t offset,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __shfl_up_sync(0xffff'ffff, val, offset, width);
#        else
                return __shfl_up(val, offset, width);
#        endif
            }
        };

        template<>
        struct ShflDown<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            __device__ static auto shfl_down(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::uint32_t offset,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __shfl_down_sync(0xffff'ffff, val, offset, width);
#        else
                return __shfl_down(val, offset, width);
#        endif
            }
        };

        template<>
        struct ShflXor<WarpUniformCudaHipBuiltIn>
        {
            template<typename T>
            __device__ static auto shfl_xor(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::int32_t mask,
                std::int32_t width) -> T
            {
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                return __shfl_xor_sync(0xffff'ffff, val, mask, width);
#        else
                return __shfl_xor(val, mask, width);
#        endif
            }
        };

    } // namespace trait
#    endif
} // namespace alpaka::warp

#endif
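
How these traits are reached in practice: user code never instantiates
WarpUniformCudaHipBuiltIn directly. Kernels call the free functions from
alpaka/warp/Traits.hpp (getSize, all, any, ballot, shfl, ...) with the
accelerator, and those dispatch to the specializations above. The following
kernel is a minimal sketch, not part of alpaka itself: the kernel and buffer
names are illustrative, and it assumes a one-dimensional GPU accelerator whose
x-threads map linearly onto warp lanes.

#include "alpaka/alpaka.hpp"

#include <cstdint>

//! Illustrative kernel: sums each warp-sized group of values via shuffles.
struct WarpSumKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out) const
    {
        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
        auto const warpExtent = alpaka::warp::getSize(acc); // GetSize trait above
        float v = in[i];

        // Butterfly reduction: each step folds the upper half of the remaining
        // lanes onto the lower half through the ShflDown trait above.
        for(std::int32_t offset = warpExtent / 2; offset > 0; offset /= 2)
            v += alpaka::warp::shfl_down(acc, v, static_cast<std::uint32_t>(offset));

        // Lane 0 of every warp writes the warp-wide sum.
        if(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] % warpExtent == 0)
            out[i / warpExtent] = v;
    }
};

On CUDA each shuffle above lowers to __shfl_down_sync with a full member mask;
on HIP it lowers to __shfl_down across the whole 32- or 64-lane wavefront, so
the same kernel is correct for both warp sizes.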