alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WarpUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1/* Copyright 2026 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego,
2 * Simone Balducci
3 * SPDX-License-Identifier: MPL-2.0
4 */
5
6#pragma once
7
11
12#include <cstdint>
13
14#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
15
16namespace alpaka::warp
17{
    //! The GPU CUDA/HIP warp.
    //!
    //! Implements the warp concept for the CUDA and HIP back-ends. The only
    //! member is the mask type used by warp vote/ballot operations; its width
    //! matches the back-end's maximum warp/wavefront size.
    struct WarpUniformCudaHipBuiltIn : public interface::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
    {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
        // CUDA warps are 32 lanes, so a 32-bit mask suffices.
        using mask_type = std::uint32_t;
#    else
        // HIP wavefronts may be up to 64 lanes, so a 64-bit mask is needed.
        using mask_type = std::uint64_t;
#    endif
    };
27
28# if !defined(ALPAKA_HOST_ONLY)
29
30# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
31# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
32# endif
33
34# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
35# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
36# endif
37
38 namespace trait
39 {
40
41 template<>
43 {
            //! Returns the warp size of the executing device at run time via the
            //! built-in warpSize variable (32 on CUDA; 32 or 64 on HIP, depending
            //! on the target device).
            static __device__ auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
            {
                return warpSize;
            }
48 };
49
50 template<>
52 {
53 static constexpr __device__ auto getSizeCompileTime() -> std::int32_t
54 {
55# if defined(__CUDA_ARCH__)
56 // CUDA always has a warp size of 32
57 return 32;
58# elif defined(__HIP_DEVICE_COMPILE__)
59 // HIP/ROCm may have a wavefront of 32 or 64 depending on the target device
60# if defined(__GFX9__)
61 // GCN 5.0 and CDNA GPUs have a wavefront size of 64
62 return 64;
63# elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
64 // RDNA GPUs have a wavefront size of 32
65 return 32;
66# else
67 // Unknown AMD GPU architecture
68# ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
69 return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
70# else
71# error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
72 return 0;
73# endif
74# endif
75# endif
76 // Host compilation
77 return 0;
78 }
79 };
80
81 template<>
83 {
84 static constexpr __device__ auto getSizeUpperLimit() -> std::int32_t
85 {
86# if defined(__CUDA_ARCH__)
87 // CUDA always has a warp size of 32
88 return 32;
89# elif defined(__HIP_DEVICE_COMPILE__)
90 // HIP/ROCm may have a wavefront of 32 or 64 depending on the target device
91# if defined(__GFX9__)
92 // GCN 5.0 and CDNA GPUs have a wavefront size of 64
93 return 64;
94# elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
95 // RDNA GPUs have a wavefront size of 32
96 return 32;
97# else
98 // Unknown AMD GPU architecture
99# ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
100 return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
101# else
102# error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
103 return 64;
104# endif
105# endif
106# endif
107 // Host compilation
108 return 64;
109 }
110 };
111
112 template<>
114 {
115 static __device__ auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
117 {
118# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
119 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
120 return __activemask();
121# else
122 // No HIP intrinsic for it, emulate via ballot
123 return __ballot(1);
124# endif
125 }
126 };
127
128 template<>
130 {
            //! Warp vote: returns non-zero iff predicate is non-zero for every
            //! participating lane of the warp.
            static __device__ auto all(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> std::int32_t
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __all_sync(activemask(warp), predicate);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less vote intrinsic.
                return __all(predicate);
#    endif
            }
142 };
143
144 template<>
146 {
            //! Warp vote: returns non-zero iff predicate is non-zero for at least
            //! one participating lane of the warp.
            static __device__ auto any(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> std::int32_t
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __any_sync(activemask(warp), predicate);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less vote intrinsic.
                return __any(predicate);
#    endif
            }
158 };
159
160 template<>
162 {
            //! Warp vote: returns a mask whose Nth bit is set iff predicate is
            //! non-zero for participating lane N (32- or 64-bit depending on the
            //! back-end's mask_type).
            static __device__ auto ballot(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                std::int32_t predicate) -> WarpUniformCudaHipBuiltIn::mask_type
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __ballot_sync(activemask(warp), predicate);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less vote intrinsic.
                return __ballot(predicate);
#    endif
            }
174 };
175
176 template<>
178 {
            //! Warp shuffle: each participating lane receives val from lane
            //! srcLane, within sub-groups of width lanes.
            template<typename T>
            static __device__ auto shfl(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                int srcLane,
                std::int32_t width) -> T
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __shfl_sync(activemask(warp), val, srcLane, width);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less shuffle intrinsic.
                return __shfl(val, srcLane, width);
#    endif
            }
193 };
194
195 template<>
197 {
            //! Warp shuffle up: each participating lane receives val from the lane
            //! with an index lower by offset, within sub-groups of width lanes.
            template<typename T>
            static __device__ auto shfl_up(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::uint32_t offset,
                std::int32_t width) -> T
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __shfl_up_sync(activemask(warp), val, offset, width);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less shuffle intrinsic.
                return __shfl_up(val, offset, width);
#    endif
            }
212 };
213
214 template<>
216 {
            //! Warp shuffle down: each participating lane receives val from the
            //! lane with an index higher by offset, within sub-groups of width lanes.
            template<typename T>
            static __device__ auto shfl_down(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::uint32_t offset,
                std::int32_t width) -> T
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __shfl_down_sync(activemask(warp), val, offset, width);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less shuffle intrinsic.
                return __shfl_down(val, offset, width);
#    endif
            }
231 };
232
233 template<>
235 {
            //! Warp shuffle xor: each participating lane receives val from the lane
            //! whose index is its own XOR-ed with mask, within sub-groups of width lanes.
            template<typename T>
            static __device__ auto shfl_xor(
                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
                T val,
                std::int32_t mask,
                std::int32_t width) -> T
            {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)                                                                          \
        || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
                // NOTE(review): the participant set is the currently-active mask, so all
                // lanes meant to take part must be converged at this call site.
                return __shfl_xor_sync(activemask(warp), val, mask, width);
#    else
                // Legacy HIP (< ROCm 6.2) only provides the mask-less shuffle intrinsic.
                return __shfl_xor(val, mask, width);
#    endif
            }
250 };
251
252 } // namespace trait
253# endif
254} // namespace alpaka::warp
255
256#endif
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> typename TWarp::mask_type
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:121
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition Interface.hpp:15
static __device__ auto activemask(warp::WarpUniformCudaHipBuiltIn const &) -> WarpUniformCudaHipBuiltIn::mask_type
The active mask trait.
Definition Traits.hpp:64
static __device__ auto all(warp::WarpUniformCudaHipBuiltIn const &warp, std::int32_t predicate) -> std::int32_t
The all warp vote trait.
Definition Traits.hpp:36
static __device__ auto any(warp::WarpUniformCudaHipBuiltIn const &warp, std::int32_t predicate) -> std::int32_t
The any warp vote trait.
Definition Traits.hpp:40
static __device__ auto ballot(warp::WarpUniformCudaHipBuiltIn const &warp, std::int32_t predicate) -> WarpUniformCudaHipBuiltIn::mask_type
The ballot warp vote trait.
Definition Traits.hpp:44
static constexpr __device__ auto getSizeCompileTime() -> std::int32_t
The compile-time warp size trait.
Definition Traits.hpp:28
static constexpr __device__ auto getSizeUpperLimit() -> std::int32_t
The warp size upper-limit trait.
Definition Traits.hpp:32
static __device__ auto getSize(warp::WarpUniformCudaHipBuiltIn const &) -> std::int32_t
The warp size trait.
Definition Traits.hpp:24
static __device__ auto shfl_down(warp::WarpUniformCudaHipBuiltIn const &warp, T val, std::uint32_t offset, std::int32_t width) -> T
The shfl down warp swizzling trait.
Definition Traits.hpp:56
static __device__ auto shfl_up(warp::WarpUniformCudaHipBuiltIn const &warp, T val, std::uint32_t offset, std::int32_t width) -> T
The shfl up warp swizzling trait.
Definition Traits.hpp:52
static __device__ auto shfl_xor(warp::WarpUniformCudaHipBuiltIn const &warp, T val, std::int32_t mask, std::int32_t width) -> T
The shfl xor warp swizzling trait.
Definition Traits.hpp:60
static __device__ auto shfl(warp::WarpUniformCudaHipBuiltIn const &warp, T val, int srcLane, std::int32_t width) -> T
The shfl warp swizzling trait.
Definition Traits.hpp:48