alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Traits.hpp
Go to the documentation of this file.
1/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
9
#include "alpaka/core/Common.hpp"
#include "alpaka/core/Interface.hpp"

#include <cstdint>
#include <type_traits>
12
13namespace alpaka::warp
14{
16 {
17 };
18
    //! The warp traits.
    //!
    //! Each trait is specialized by an accelerator backend for its warp
    //! implementation type; the TSfinae parameter (defaulted to void) is a
    //! hook for SFINAE-based partial specializations.
    namespace trait
    {
        //! The warp size trait.
        template<typename TWarp, typename TSfinae = void>
        struct GetSize;

        //! The all warp vote trait.
        template<typename TWarp, typename TSfinae = void>
        struct All;

        //! The any warp vote trait.
        template<typename TWarp, typename TSfinae = void>
        struct Any;

        //! The ballot warp vote trait.
        template<typename TWarp, typename TSfinae = void>
        struct Ballot;

        //! The shfl warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct Shfl;

        //! The shfl up warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct ShflUp;

        //! The shfl down warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct ShflDown;

        //! The shfl xor warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct ShflXor;

        //! The active mask trait.
        template<typename TWarp, typename TSfinae = void>
        struct Activemask;
    } // namespace trait
58
59 //! Returns warp size.
60 //!
61 //! \tparam TWarp The warp implementation type.
62 //! \param warp The warp implementation.
64 template<typename TWarp>
65 ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
66 {
69 }
70
71 //! Returns a 32- or 64-bit unsigned integer (depending on the
72 //! accelerator) whose Nth bit is set if and only if the Nth thread
73 //! of the warp is active.
74 //!
75 //! Note: decltype for return type is required there, otherwise
76 //! compilcation with a CPU and a GPU accelerator enabled fails as it
77 //! tries to call device function from a host-device one. The reason
78 //! is unclear, but likely related to deducing the return type.
79 //!
80 //! Note:
81 //! * The programmer must ensure that all threads calling this function are executing
82 //! the same line of code. In particular it is not portable to write
83 //! if(a) {activemask} else {activemask}.
84 //!
85 //! \tparam TWarp The warp implementation type.
86 //! \param warp The warp implementation.
87 //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
89 template<typename TWarp>
96
97 //! Evaluates predicate for all active threads of the warp and returns
98 //! non-zero if and only if predicate evaluates to non-zero for all of them.
99 //!
100 //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
101 //! the operation is applied for all active threads.
102 //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
103 //!
104 //! Note:
105 //! * The programmer must ensure that all threads calling this function are executing
106 //! the same line of code. In particular it is not portable to write
107 //! if(a) {all} else {all}.
108 //!
109 //! \tparam TWarp The warp implementation type.
110 //! \param warp The warp implementation.
111 //! \param predicate The predicate value for current thread.
113 template<typename TWarp>
114 ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
115 {
116 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
117 return trait::All<ImplementationBase>::all(warp, predicate);
118 }
119
120 //! Evaluates predicate for all active threads of the warp and returns
121 //! non-zero if and only if predicate evaluates to non-zero for any of them.
122 //!
123 //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
124 //! the operation is applied for all active threads.
125 //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
126 //!
127 //! Note:
128 //! * The programmer must ensure that all threads calling this function are executing
129 //! the same line of code. In particular it is not portable to write
130 //! if(a) {any} else {any}.
131 //!
132 //! \tparam TWarp The warp implementation type.
133 //! \param warp The warp implementation.
134 //! \param predicate The predicate value for current thread.
136 template<typename TWarp>
137 ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
138 {
139 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
140 return trait::Any<ImplementationBase>::any(warp, predicate);
141 }
142
143 //! Evaluates predicate for all non-exited threads in a warp and returns
144 //! a 32- or 64-bit unsigned integer (depending on the accelerator)
145 //! whose Nth bit is set if and only if predicate evaluates to non-zero
146 //! for the Nth thread of the warp and the Nth thread is active.
147 //!
148 //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
149 //! the operation is applied for all active threads.
150 //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
151 //! Return type is 64-bit to fit all platforms.
152 //!
153 //! Note:
154 //! * The programmer must ensure that all threads calling this function are executing
155 //! the same line of code. In particular it is not portable to write
156 //! if(a) {ballot} else {ballot}.
157 //!
158 //! \tparam TWarp The warp implementation type.
159 //! \param warp The warp implementation.
160 //! \param predicate The predicate value for current thread.
161 //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
163 template<typename TWarp>
164 ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate)
165 {
166 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
167 return trait::Ballot<ImplementationBase>::ballot(warp, predicate);
168 }
169
170 //! Exchange data between threads within a warp.
171 //!
172 //! Effectively executes:
173 //!
174 //! __shared__ int32_t values[warpsize];
175 //! values[threadIdx.x] = value;
176 //! __syncthreads();
177 //! return values[width*(threadIdx.x/width) + srcLane%width];
178 //!
179 //! However, it does not use shared memory.
180 //!
181 //! Notes:
182 //! * The programmer must ensure that all threads calling this
183 //! function (and the srcLane) are executing the same line of code.
184 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
185 //!
186 //! * Commonly used with width = warpsize (the default), (returns values[srcLane])
187 //!
188 //! * Width must be a power of 2.
189 //!
190 //! \tparam TWarp warp implementation type
191 //! \param warp warp implementation
192 //! \param value value to broadcast (only meaningful from threadIdx == srcLane)
193 //! \param srcLane source lane sending value
194 //! \param width number of threads receiving a single value
195 //! \return val from the thread index srcLane.
197 template<typename TWarp, typename T>
198 ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
199 {
200 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
201 return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
202 }
203
204 //! Exchange data between threads within a warp.
205 //! It copies from a lane with lower ID relative to caller.
206 //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
207 //!
208 //! Effectively executes:
209 //!
210 //! __shared__ int32_t values[warpsize];
211 //! values[threadIdx.x] = value;
212 //! __syncthreads();
213 //! return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
214 //!
215 //! However, it does not use shared memory.
216 //!
217 //! Notes:
218 //! * The programmer must ensure that all threads calling this
219 //! function (and the srcLane) are executing the same line of code.
220 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
221 //!
222 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
223 //! delta)
224 //!
225 //! * Width must be a power of 2.
226 //!
227 //! \tparam TWarp warp implementation type
228 //! \tparam T value type
229 //! \param warp warp implementation
230 //! \param value value to broadcast
231 //! \param offset corresponds to the delta used to compute the lane ID
232 //! \param width size of the group participating in the shuffle operation
233 //! \return val from the thread index lane ID.
235 template<typename TWarp, typename T>
236 ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
237 {
238 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
239 return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
240 }
241
242 //! Exchange data between threads within a warp.
243 //! It copies from a lane with higher ID relative to caller.
244 //! The lane ID is calculated by adding delta to the caller’s lane ID.
245 //!
246 //! Effectively executes:
247 //!
248 //! __shared__ int32_t values[warpsize];
249 //! values[threadIdx.x] = value;
250 //! __syncthreads();
251 //! return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
252 //!
253 //! However, it does not use shared memory.
254 //!
255 //! Notes:
256 //! * The programmer must ensure that all threads calling this
257 //! function (and the srcLane) are executing the same line of code.
258 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
259 //!
260 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
261 //! warpsize)
262 //!
263 //! * Width must be a power of 2.
264 //!
265 //! \tparam TWarp warp implementation type
266 //! \tparam T value type
267 //! \param warp warp implementation
268 //! \param value value to broadcast
269 //! \param offset corresponds to the delta used to compute the lane ID
270 //! \param width size of the group participating in the shuffle operation
271 //! \return val from the thread index lane ID.
273 template<typename TWarp, typename T>
274 ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
275 {
276 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
277 return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
278 }
279
280 //! Exchange data between threads within a warp.
281 //! It copies from a lane based on bitwise XOR of own lane ID.
282 //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask
283 //!
284 //! Effectively executes:
285 //!
286 //! __shared__ int32_t values[warpsize];
287 //! values[threadIdx.x] = value;
288 //! __syncthreads();
289 //! int lane = threadIdx.x ^ mask;
290 //! return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
291 //!
292 //! However, it does not use shared memory.
293 //!
294 //! Notes:
295 //! * The programmer must ensure that all threads calling this
296 //! function (and the srcLane) are executing the same line of code.
297 //! In particular it is not portable to write if(a) {shfl} else {shfl}.
298 //!
299 //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask])
300 //!
301 //! * Width must be a power of 2.
302 //!
303 //! \tparam TWarp warp implementation type
304 //! \tparam T value type
305 //! \param warp warp implementation
306 //! \param value value to broadcast
307 //! \param mask corresponds to the mask used to compute the lane ID
308 //! \param width size of the group participating in the shuffle operation
309 //! \return val from the thread index lane ID.
311 template<typename TWarp, typename T>
312 ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
313 {
314 using ImplementationBase = interface::ImplementationBase<ConceptWarp, TWarp>;
315 return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
316 }
317} // namespace alpaka::warp
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition Common.hpp:38
#define ALPAKA_NO_HOST_ACC_WARNING
Disable nvcc warning: 'calling a host function from host device function.' Usage: ALPAKA_NO_HOST_ACC_...
Definition Common.hpp:82
typename detail::ImplementationBaseType< TInterface, TDerived >::type ImplementationBase
Returns the type that implements the given interface in the inheritance hierarchy.
Definition Interface.hpp:66
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:114
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:164
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:236
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:90
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:137
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:274
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:65
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:312
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:198
The active mask trait.
Definition Traits.hpp:56
The all warp vote trait.
Definition Traits.hpp:28
The any warp vote trait.
Definition Traits.hpp:32
The ballot warp vote trait.
Definition Traits.hpp:36
The warp size trait.
Definition Traits.hpp:24
The shfl down warp swizzling trait.
Definition Traits.hpp:48
The shfl up warp swizzling trait.
Definition Traits.hpp:44
The shfl xor warp swizzling trait.
Definition Traits.hpp:52
The shfl warp swizzling trait.
Definition Traits.hpp:40