alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Traits.hpp
Go to the documentation of this file.
1/* Copyright 2026 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego, Simone Balducci
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
9
10#include <cstdint>
11#include <type_traits>
12
13namespace alpaka::warp
14{
16 {
17 };
18
19 //! The warp traits.
20 namespace trait
21 {
22 //! The warp size trait.
23 template<typename TWarp, typename TSfinae = void>
24 struct GetSize;
25
26 //! The compile-time warp size trait.
27 template<typename TWarp, typename TSfinae = void>
29
30 //! The warp size upper-limit trait.
31 template<typename TWarp, typename TSfinae = void>
33
34 //! The all warp vote trait.
35 template<typename TWarp, typename TSfinae = void>
36 struct All;
37
38 //! The any warp vote trait.
39 template<typename TWarp, typename TSfinae = void>
40 struct Any;
41
42 //! The ballot warp vote trait.
43 template<typename TWarp, typename TSfinae = void>
44 struct Ballot;
45
46 //! The shfl warp swizzling trait.
47 template<typename TWarp, typename TSfinae = void>
48 struct Shfl;
49
50 //! The shfl up warp swizzling trait.
51 template<typename TWarp, typename TSfinae = void>
52 struct ShflUp;
53
54 //! The shfl down warp swizzling trait.
55 template<typename TWarp, typename TSfinae = void>
56 struct ShflDown;
57
58 //! The shfl xor warp swizzling trait.
59 template<typename TWarp, typename TSfinae = void>
60 struct ShflXor;
61
62 //! The active mask trait.
63 template<typename TWarp, typename TSfinae = void>
64 struct Activemask;
65 } // namespace trait
66
67 //! Returns warp size.
68 //!
69 //! \tparam TWarp The warp implementation type.
70 //! \param warp The warp implementation.
72 template<typename TWarp>
73 ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
74 {
77 }
78
79 //! If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
80 //!
81 //! \tparam TWarp The warp implementation type.
83 template<typename TWarp>
89
90 //! If the warp size is available as a compile-time constant returns its value; otherwise returns an upper limit on
91 //! the possible warp size values.
92 //!
93 //! \tparam TWarp The warp implementation type.
95 template<typename TWarp>
101
102 //! Returns a 32- or 64-bit unsigned integer (depending on the
103 //! accelerator) whose Nth bit is set if and only if the Nth thread
104 //! of the warp is active.
105 //!
106 //! Note: decltype for return type is required there, otherwise
107 //! compilation with a CPU and a GPU accelerator enabled fails as it
108 //! tries to call device function from a host-device one. The reason
109 //! is unclear, but likely related to deducing the return type.
110 //!
111 //! Note:
112 //! * The programmer must ensure that all threads calling this function are executing
113 //! the same line of code. In particular it is not portable to write
114 //! if(a) {activemask} else {activemask}.
115 //!
116 //! \tparam TWarp The warp implementation type.
117 //! \param warp The warp implementation.
118 //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
120 template<typename TWarp>
121 ALPAKA_FN_ACC auto activemask(TWarp const& warp) -> typename TWarp::mask_type
122 {
123 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
125 }
126
127 //! Evaluates predicate for all active threads of the warp and returns
128 //! non-zero if and only if predicate evaluates to non-zero for all of them.
129 //!
130 //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
131 //! the operation is applied for all active threads.
132 //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
133 //!
134 //! Note:
135 //! * The programmer must ensure that all threads calling this function are executing
136 //! the same line of code. In particular it is not portable to write
137 //! if(a) {all} else {all}.
138 //!
139 //! \tparam TWarp The warp implementation type.
140 //! \param warp The warp implementation.
141 //! \param predicate The predicate value for current thread.
143 template<typename TWarp>
144 ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
145 {
146 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
147 return trait::All<ImplementationBase>::all(warp, predicate);
148 }
149
150 //! Evaluates predicate for all active threads of the warp and returns
151 //! non-zero if and only if predicate evaluates to non-zero for any of them.
152 //!
153 //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
154 //! the operation is applied for all active threads.
155 //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
156 //!
157 //! Note:
158 //! * The programmer must ensure that all threads calling this function are executing
159 //! the same line of code. In particular it is not portable to write
160 //! if(a) {any} else {any}.
161 //!
162 //! \tparam TWarp The warp implementation type.
163 //! \param warp The warp implementation.
164 //! \param predicate The predicate value for current thread.
166 template<typename TWarp>
167 ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
168 {
169 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
170 return trait::Any<ImplementationBase>::any(warp, predicate);
171 }
172
173 //! Evaluates predicate for all non-exited threads in a warp and returns
174 //! a 32- or 64-bit unsigned integer (depending on the accelerator)
175 //! whose Nth bit is set if and only if predicate evaluates to non-zero
176 //! for the Nth thread of the warp and the Nth thread is active.
177 //!
178 //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
179 //! the operation is applied for all active threads.
180 //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
181 //! Return type is 64-bit to fit all platforms.
182 //!
183 //! Note:
184 //! * The programmer must ensure that all threads calling this function are executing
185 //! the same line of code. In particular it is not portable to write
186 //! if(a) {ballot} else {ballot}.
187 //!
188 //! \tparam TWarp The warp implementation type.
189 //! \param warp The warp implementation.
190 //! \param predicate The predicate value for current thread.
191 //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
193 template<typename TWarp>
194 ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate) -> typename TWarp::mask_type
195 {
196 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
197 return trait::Ballot<ImplementationBase>::ballot(warp, predicate);
198 }
199
200 //! Exchange data between threads within a warp.
201 //!
202 //! Effectively executes:
203 //!
204 //! __shared__ int32_t values[warpsize];
205 //! values[threadIdx.x] = value;
206 //! __syncthreads();
207 //! return values[width*(threadIdx.x/width) + srcLane%width];
208 //!
209 //! However, it does not use shared memory.
210 //!
211 //! Notes:
212 //! * The programmer must ensure that all threads calling this
213 //! function (and the srcLane) are executing the same line of code.
214 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
215 //!
216 //! * Commonly used with width = warpsize (the default), (returns values[srcLane])
217 //!
218 //! * Width must be a power of 2.
219 //!
220 //! \tparam TWarp warp implementation type
221 //! \param warp warp implementation
222 //! \param value value to broadcast (only meaningful from threadIdx == srcLane)
223 //! \param srcLane source lane sending value
224 //! \param width number of threads receiving a single value
225 //! \return val from the thread index srcLane.
227 template<typename TWarp, typename T>
228 ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
229 {
230 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
231 return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
232 }
233
234 //! Exchange data between threads within a warp.
235 //! It copies from a lane with lower ID relative to caller.
236 //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
237 //!
238 //! Effectively executes:
239 //!
240 //! __shared__ int32_t values[warpsize];
241 //! values[threadIdx.x] = value;
242 //! __syncthreads();
243 //! return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
244 //!
245 //! However, it does not use shared memory.
246 //!
247 //! Notes:
248 //! * The programmer must ensure that all threads calling this
249 //! function (and the srcLane) are executing the same line of code.
250 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
251 //!
252 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
253 //! delta)
254 //!
255 //! * Width must be a power of 2.
256 //!
257 //! \tparam TWarp warp implementation type
258 //! \tparam T value type
259 //! \param warp warp implementation
260 //! \param value value to broadcast
261 //! \param offset corresponds to the delta used to compute the lane ID
262 //! \param width size of the group participating in the shuffle operation
263 //! \return val from the thread index lane ID.
265 template<typename TWarp, typename T>
266 ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
267 {
268 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
269 return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
270 }
271
272 //! Exchange data between threads within a warp.
273 //! It copies from a lane with higher ID relative to caller.
274 //! The lane ID is calculated by adding delta to the caller’s lane ID.
275 //!
276 //! Effectively executes:
277 //!
278 //! __shared__ int32_t values[warpsize];
279 //! values[threadIdx.x] = value;
280 //! __syncthreads();
281 //! return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
282 //!
283 //! However, it does not use shared memory.
284 //!
285 //! Notes:
286 //! * The programmer must ensure that all threads calling this
287 //! function (and the srcLane) are executing the same line of code.
288 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
289 //!
290 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
291 //! warpsize)
292 //!
293 //! * Width must be a power of 2.
294 //!
295 //! \tparam TWarp warp implementation type
296 //! \tparam T value type
297 //! \param warp warp implementation
298 //! \param value value to broadcast
299 //! \param offset corresponds to the delta used to compute the lane ID
300 //! \param width size of the group participating in the shuffle operation
301 //! \return val from the thread index lane ID.
303 template<typename TWarp, typename T>
304 ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
305 {
306 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
307 return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
308 }
309
310 //! Exchange data between threads within a warp.
311 //! It copies from a lane based on bitwise XOR of own lane ID.
312 //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask
313 //!
314 //! Effectively executes:
315 //!
316 //! __shared__ int32_t values[warpsize];
317 //! values[threadIdx.x] = value;
318 //! __syncthreads();
319 //! int lane = threadIdx.x ^ mask;
320 //! return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
321 //!
322 //! However, it does not use shared memory.
323 //!
324 //! Notes:
325 //! * The programmer must ensure that all threads calling this
326 //! function (and the srcLane) are executing the same line of code.
327 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
328 //!
329 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask])
330 //!
331 //! * Width must be a power of 2.
332 //!
333 //! \tparam TWarp warp implementation type
334 //! \tparam T value type
335 //! \param warp warp implementation
336 //! \param value value to broadcast
337 //! \param mask corresponds to the mask used to compute the lane ID
338 //! \param width size of the group participating in the shuffle operation
339 //! \return val from the thread index lane ID.
341 template<typename TWarp, typename T>
342 ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
343 {
344 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
345 return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
346 }
347} // namespace alpaka::warp
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition Common.hpp:41
#define ALPAKA_NO_HOST_ACC_WARNING
Disable nvcc warning: 'calling a host function from host device function.' Usage: ALPAKA_NO_HOST_ACC_...
Definition Common.hpp:85
typename detail::ImplementationBaseType< TInterface, TDerived >::type ImplementationBase
Returns the type that implements the given interface in the inheritance hierarchy.
Definition Interface.hpp:66
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate) -> typename TWarp::mask_type
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:194
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:144
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:266
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
Definition Traits.hpp:96
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:167
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:304
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:73
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:342
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> typename TWarp::mask_type
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
Definition Traits.hpp:84
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:228
The active mask trait.
Definition Traits.hpp:64
The all warp vote trait.
Definition Traits.hpp:36
The any warp vote trait.
Definition Traits.hpp:40
The ballot warp vote trait.
Definition Traits.hpp:44
The compile-time warp size trait.
Definition Traits.hpp:28
The warp size upper-limit trait.
Definition Traits.hpp:32
The warp size trait.
Definition Traits.hpp:24
The shfl down warp swizzling trait.
Definition Traits.hpp:56
The shfl up warp swizzling trait.
Definition Traits.hpp:52
The shfl xor warp swizzling trait.
Definition Traits.hpp:60
The shfl warp swizzling trait.
Definition Traits.hpp:48