alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Traits.hpp
Go to the documentation of this file.
1/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
9
10#include <cstdint>
11#include <type_traits>
12
13namespace alpaka::warp
14{
16 {
17 };
18
19 //! The warp traits.
20 namespace trait
21 {
22 //! The warp size trait.
23 template<typename TWarp, typename TSfinae = void>
24 struct GetSize;
25
26 //! The compile-time warp size trait.
27 template<typename TWarp, typename TSfinae = void>
29
30 //! The warp size upper-limit trait.
31 template<typename TWarp, typename TSfinae = void>
33
34 //! The all warp vote trait.
35 template<typename TWarp, typename TSfinae = void>
36 struct All;
37
38 //! The any warp vote trait.
39 template<typename TWarp, typename TSfinae = void>
40 struct Any;
41
42 //! The ballot warp vote trait.
43 template<typename TWarp, typename TSfinae = void>
44 struct Ballot;
45
46 //! The shfl warp swizzling trait.
47 template<typename TWarp, typename TSfinae = void>
48 struct Shfl;
49
50 //! The shfl up warp swizzling trait.
51 template<typename TWarp, typename TSfinae = void>
52 struct ShflUp;
53
54 //! The shfl down warp swizzling trait.
55 template<typename TWarp, typename TSfinae = void>
56 struct ShflDown;
57
58 //! The shfl xor warp swizzling trait.
59 template<typename TWarp, typename TSfinae = void>
60 struct ShflXor;
61
62 //! The active mask trait.
63 template<typename TWarp, typename TSfinae = void>
64 struct Activemask;
65 } // namespace trait
66
67 //! Returns warp size.
68 //!
69 //! \tparam TWarp The warp implementation type.
70 //! \param warp The warp implementation.
72 template<typename TWarp>
73 ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
74 {
77 }
78
79 //! If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
80 //!
81 //! \tparam TWarp The warp implementation type.
83 template<typename TWarp>
89
90 //! If the warp size is available as a compile-time constant returns its value; otherwise returns an upper limit on
91 //! the possible warp size values.
92 //!
93 //! \tparam TWarp The warp implementation type.
95 template<typename TWarp>
101
102 //! Returns a 32- or 64-bit unsigned integer (depending on the
103 //! accelerator) whose Nth bit is set if and only if the Nth thread
104 //! of the warp is active.
105 //!
106 //! Note: decltype for return type is required there, otherwise
107 //! compilation with a CPU and a GPU accelerator enabled fails as it
108 //! tries to call device function from a host-device one. The reason
109 //! is unclear, but likely related to deducing the return type.
110 //!
111 //! Note:
112 //! * The programmer must ensure that all threads calling this function are executing
113 //! the same line of code. In particular it is not portable to write
114 //! if(a) {activemask} else {activemask}.
115 //!
116 //! \tparam TWarp The warp implementation type.
117 //! \param warp The warp implementation.
118 //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
120 template<typename TWarp>
127
128 //! Evaluates predicate for all active threads of the warp and returns
129 //! non-zero if and only if predicate evaluates to non-zero for all of them.
130 //!
131 //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
132 //! the operation is applied for all active threads.
133 //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
134 //!
135 //! Note:
136 //! * The programmer must ensure that all threads calling this function are executing
137 //! the same line of code. In particular it is not portable to write
138 //! if(a) {all} else {all}.
139 //!
140 //! \tparam TWarp The warp implementation type.
141 //! \param warp The warp implementation.
142 //! \param predicate The predicate value for current thread.
144 template<typename TWarp>
145 ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
146 {
147 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
148 return trait::All<ImplementationBase>::all(warp, predicate);
149 }
150
151 //! Evaluates predicate for all active threads of the warp and returns
152 //! non-zero if and only if predicate evaluates to non-zero for any of them.
153 //!
154 //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
155 //! the operation is applied for all active threads.
156 //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
157 //!
158 //! Note:
159 //! * The programmer must ensure that all threads calling this function are executing
160 //! the same line of code. In particular it is not portable to write
161 //! if(a) {any} else {any}.
162 //!
163 //! \tparam TWarp The warp implementation type.
164 //! \param warp The warp implementation.
165 //! \param predicate The predicate value for current thread.
167 template<typename TWarp>
168 ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
169 {
170 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
171 return trait::Any<ImplementationBase>::any(warp, predicate);
172 }
173
174 //! Evaluates predicate for all non-exited threads in a warp and returns
175 //! a 32- or 64-bit unsigned integer (depending on the accelerator)
176 //! whose Nth bit is set if and only if predicate evaluates to non-zero
177 //! for the Nth thread of the warp and the Nth thread is active.
178 //!
179 //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
180 //! the operation is applied for all active threads.
181 //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
182 //! Return type is 64-bit to fit all platforms.
183 //!
184 //! Note:
185 //! * The programmer must ensure that all threads calling this function are executing
186 //! the same line of code. In particular it is not portable to write
187 //! if(a) {ballot} else {ballot}.
188 //!
189 //! \tparam TWarp The warp implementation type.
190 //! \param warp The warp implementation.
191 //! \param predicate The predicate value for current thread.
192 //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
194 template<typename TWarp>
195 ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate)
196 {
197 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
198 return trait::Ballot<ImplementationBase>::ballot(warp, predicate);
199 }
200
201 //! Exchange data between threads within a warp.
202 //!
203 //! Effectively executes:
204 //!
205 //! __shared__ int32_t values[warpsize];
206 //! values[threadIdx.x] = value;
207 //! __syncthreads();
208 //! return values[width*(threadIdx.x/width) + srcLane%width];
209 //!
210 //! However, it does not use shared memory.
211 //!
212 //! Notes:
213 //! * The programmer must ensure that all threads calling this
214 //! function (and the srcLane) are executing the same line of code.
215 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
216 //!
217 //! * Commonly used with width = warpsize (the default), (returns values[srcLane])
218 //!
219 //! * Width must be a power of 2.
220 //!
221 //! \tparam TWarp warp implementation type
222 //! \param warp warp implementation
223 //! \param value value to broadcast (only meaningful from threadIdx == srcLane)
224 //! \param srcLane source lane sending value
225 //! \param width number of threads receiving a single value
226 //! \return val from the thread index srcLane.
228 template<typename TWarp, typename T>
229 ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
230 {
231 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
232 return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
233 }
234
235 //! Exchange data between threads within a warp.
236 //! It copies from a lane with lower ID relative to caller.
237 //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
238 //!
239 //! Effectively executes:
240 //!
241 //! __shared__ int32_t values[warpsize];
242 //! values[threadIdx.x] = value;
243 //! __syncthreads();
244 //! return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
245 //!
246 //! However, it does not use shared memory.
247 //!
248 //! Notes:
249 //! * The programmer must ensure that all threads calling this
250 //! function (and the srcLane) are executing the same line of code.
251 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
252 //!
253 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
254 //! delta)
255 //!
256 //! * Width must be a power of 2.
257 //!
258 //! \tparam TWarp warp implementation type
259 //! \tparam T value type
260 //! \param warp warp implementation
261 //! \param value value to broadcast
262 //! \param offset corresponds to the delta used to compute the lane ID
263 //! \param width size of the group participating in the shuffle operation
264 //! \return val from the thread index lane ID.
266 template<typename TWarp, typename T>
267 ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
268 {
269 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
270 return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
271 }
272
273 //! Exchange data between threads within a warp.
274 //! It copies from a lane with higher ID relative to caller.
275 //! The lane ID is calculated by adding delta to the caller’s lane ID.
276 //!
277 //! Effectively executes:
278 //!
279 //! __shared__ int32_t values[warpsize];
280 //! values[threadIdx.x] = value;
281 //! __syncthreads();
282 //! return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
283 //!
284 //! However, it does not use shared memory.
285 //!
286 //! Notes:
287 //! * The programmer must ensure that all threads calling this
288 //! function (and the srcLane) are executing the same line of code.
289 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
290 //!
291 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
292 //! warpsize)
293 //!
294 //! * Width must be a power of 2.
295 //!
296 //! \tparam TWarp warp implementation type
297 //! \tparam T value type
298 //! \param warp warp implementation
299 //! \param value value to broadcast
300 //! \param offset corresponds to the delta used to compute the lane ID
301 //! \param width size of the group participating in the shuffle operation
302 //! \return val from the thread index lane ID.
304 template<typename TWarp, typename T>
305 ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
306 {
307 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
308 return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
309 }
310
311 //! Exchange data between threads within a warp.
312 //! It copies from a lane based on bitwise XOR of own lane ID.
313 //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask
314 //!
315 //! Effectively executes:
316 //!
317 //! __shared__ int32_t values[warpsize];
318 //! values[threadIdx.x] = value;
319 //! __syncthreads();
320 //! int lane = threadIdx.x ^ mask;
321 //! return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
322 //!
323 //! However, it does not use shared memory.
324 //!
325 //! Notes:
326 //! * The programmer must ensure that all threads calling this
327 //! function (and the srcLane) are executing the same line of code.
328 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
329 //!
330 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask])
331 //!
332 //! * Width must be a power of 2.
333 //!
334 //! \tparam TWarp warp implementation type
335 //! \tparam T value type
336 //! \param warp warp implementation
337 //! \param value value to broadcast
338 //! \param mask corresponds to the mask used to compute the lane ID
339 //! \param width size of the group participating in the shuffle operation
340 //! \return val from the thread index lane ID.
342 template<typename TWarp, typename T>
343 ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
344 {
345 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
346 return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
347 }
348} // namespace alpaka::warp
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition Common.hpp:38
#define ALPAKA_NO_HOST_ACC_WARNING
Disable nvcc warning: 'calling a host function from host device function.' Usage: ALPAKA_NO_HOST_ACC_...
Definition Common.hpp:82
typename detail::ImplementationBaseType< TInterface, TDerived >::type ImplementationBase
Returns the type that implements the given interface in the inheritance hierarchy.
Definition Interface.hpp:66
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:145
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:195
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:267
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
Definition Traits.hpp:96
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:168
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:305
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:73
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:343
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
Definition Traits.hpp:84
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:229
The active mask trait.
Definition Traits.hpp:64
The all warp vote trait.
Definition Traits.hpp:36
The any warp vote trait.
Definition Traits.hpp:40
The ballot warp vote trait.
Definition Traits.hpp:44
The compile-time warp size trait.
Definition Traits.hpp:28
The warp size upper-limit trait.
Definition Traits.hpp:32
The warp size trait.
Definition Traits.hpp:24
The shfl down warp swizzling trait.
Definition Traits.hpp:56
The shfl up warp swizzling trait.
Definition Traits.hpp:52
The shfl xor warp swizzling trait.
Definition Traits.hpp:60
The shfl warp swizzling trait.
Definition Traits.hpp:48