alpaka
Abstraction Library for Parallel Kernel Acceleration
Traits.hpp
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/core/Common.hpp"
#include "alpaka/core/Concepts.hpp"

#include <cstdint>
#include <type_traits>
namespace alpaka::warp
{
    struct ConceptWarp
    {
    };

    //! The warp traits.
    namespace trait
    {
        //! The warp size trait.
        template<typename TWarp, typename TSfinae = void>
        struct GetSize;

        //! The all warp vote trait.
        template<typename TWarp, typename TSfinae = void>
        struct All;

        //! The any warp vote trait.
        template<typename TWarp, typename TSfinae = void>
        struct Any;

        //! The ballot warp vote trait.
        template<typename TWarp, typename TSfinae = void>
        struct Ballot;

        //! The shfl warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct Shfl;

        //! The shfl up warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct ShflUp;

        //! The shfl down warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct ShflDown;

        //! The shfl xor warp swizzling trait.
        template<typename TWarp, typename TSfinae = void>
        struct ShflXor;

        //! The active mask trait.
        template<typename TWarp, typename TSfinae = void>
        struct Activemask;
    } // namespace trait

    //! Returns warp size.
    //!
    //! \tparam TWarp The warp implementation type.
    //! \param warp The warp implementation.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp>
    ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::GetSize<ImplementationBase>::getSize(warp);
    }
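
    // Example (illustrative sketch, not part of the original header): inside a
    // kernel the accelerator itself provides the warp implementation, so the
    // warp size can be queried directly from it. The kernel functor below and
    // its parameters are hypothetical.
    //
    //     struct WarpSizeKernel
    //     {
    //         template<typename TAcc>
    //         ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t* out) const
    //         {
    //             // e.g. 32 on NVIDIA GPUs, 32 or 64 on AMD GPUs, typically 1 on CPU backends
    //             *out = alpaka::warp::getSize(acc);
    //         }
    //     };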

    //! Returns a 32- or 64-bit unsigned integer (depending on the
    //! accelerator) whose Nth bit is set if and only if the Nth thread
    //! of the warp is active.
    //!
    //! Note: decltype for the return type is required here, otherwise
    //! compilation with a CPU and a GPU accelerator enabled fails, as it
    //! tries to call a device function from a host-device one. The reason
    //! is unclear, but likely related to deducing the return type.
    //!
    //! Note:
    //! * The programmer must ensure that all threads calling this function are executing
    //!   the same line of code. In particular it is not portable to write
    //!   if(a) {activemask} else {activemask}.
    //!
    //! \tparam TWarp The warp implementation type.
    //! \param warp The warp implementation.
    //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp>
    ALPAKA_FN_ACC auto activemask(TWarp const& warp)
        -> decltype(trait::Activemask<concepts::ImplementationBase<ConceptWarp, TWarp>>::activemask(warp))
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::Activemask<ImplementationBase>::activemask(warp);
    }
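
    // Example (illustrative sketch, not part of the original header): the mask
    // width differs between accelerators, so portable code should widen it to
    // 64 bits before inspecting individual bits. The helper name is hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC auto countActiveLanes(TAcc const& acc) -> std::int32_t
    //     {
    //         std::uint64_t mask = alpaka::warp::activemask(acc); // widened copy
    //         std::int32_t count = 0;
    //         for(; mask != 0; mask >>= 1)
    //             count += static_cast<std::int32_t>(mask & 1u);
    //         return count;
    //     }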

    //! Evaluates predicate for all active threads of the warp and returns
    //! non-zero if and only if predicate evaluates to non-zero for all of them.
    //!
    //! It follows the logic of __all(predicate) in CUDA before version 9.0 and in HIP:
    //! the operation is applied to all active threads.
    //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
    //!
    //! Note:
    //! * The programmer must ensure that all threads calling this function are executing
    //!   the same line of code. In particular it is not portable to write
    //!   if(a) {all} else {all}.
    //!
    //! \tparam TWarp The warp implementation type.
    //! \param warp The warp implementation.
    //! \param predicate The predicate value for the current thread.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp>
    ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::All<ImplementationBase>::all(warp, predicate);
    }
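
    // Example (illustrative sketch, not part of the original header): a warp-wide
    // early exit, where the whole warp skips the expensive path only if every
    // active lane agrees. The function and parameter names are hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC void processItem(TAcc const& acc, float x)
    //     {
    //         std::int32_t const isTrivial = (x == 0.0f) ? 1 : 0;
    //         if(alpaka::warp::all(acc, isTrivial))
    //             return; // every active lane holds a zero, nothing to do
    //         // ... expensive per-lane work ...
    //     }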

    //! Evaluates predicate for all active threads of the warp and returns
    //! non-zero if and only if predicate evaluates to non-zero for any of them.
    //!
    //! It follows the logic of __any(predicate) in CUDA before version 9.0 and in HIP:
    //! the operation is applied to all active threads.
    //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
    //!
    //! Note:
    //! * The programmer must ensure that all threads calling this function are executing
    //!   the same line of code. In particular it is not portable to write
    //!   if(a) {any} else {any}.
    //!
    //! \tparam TWarp The warp implementation type.
    //! \param warp The warp implementation.
    //! \param predicate The predicate value for the current thread.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp>
    ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::Any<ImplementationBase>::any(warp, predicate);
    }
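
    // Example (illustrative sketch, not part of the original header): the
    // counterpart of all(), entering a cooperative branch if at least one lane
    // needs it. The names are hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC void maybeRefine(TAcc const& acc, float error, float tolerance)
    //     {
    //         std::int32_t const needsRefinement = (error > tolerance) ? 1 : 0;
    //         if(alpaka::warp::any(acc, needsRefinement))
    //         {
    //             // ... all active lanes of the warp participate in the refinement ...
    //         }
    //     }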

    //! Evaluates predicate for all non-exited threads in a warp and returns
    //! a 32- or 64-bit unsigned integer (depending on the accelerator)
    //! whose Nth bit is set if and only if predicate evaluates to non-zero
    //! for the Nth thread of the warp and the Nth thread is active.
    //!
    //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and in HIP:
    //! the operation is applied to all active threads.
    //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
    //! Return type is 64-bit to fit all platforms.
    //!
    //! Note:
    //! * The programmer must ensure that all threads calling this function are executing
    //!   the same line of code. In particular it is not portable to write
    //!   if(a) {ballot} else {ballot}.
    //!
    //! \tparam TWarp The warp implementation type.
    //! \param warp The warp implementation.
    //! \param predicate The predicate value for the current thread.
    //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp>
    ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate)
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::Ballot<ImplementationBase>::ballot(warp, predicate);
    }
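
    // Example (illustrative sketch, not part of the original header): counting
    // the lanes that satisfy a predicate by popcounting the ballot mask. Storing
    // the mask in a std::uint64_t works for both the 32- and 64-bit variants.
    // The helper name is hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC auto countVotes(TAcc const& acc, std::int32_t predicate) -> std::int32_t
    //     {
    //         std::uint64_t mask = alpaka::warp::ballot(acc, predicate);
    //         std::int32_t votes = 0;
    //         for(; mask != 0; mask >>= 1)
    //             votes += static_cast<std::int32_t>(mask & 1u);
    //         return votes;
    //     }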

    //! Exchange data between threads within a warp.
    //!
    //! Effectively executes:
    //!
    //!     __shared__ int32_t values[warpsize];
    //!     values[threadIdx.x] = value;
    //!     __syncthreads();
    //!     return values[width*(threadIdx.x/width) + srcLane%width];
    //!
    //! However, it does not use shared memory.
    //!
    //! Notes:
    //! * The programmer must ensure that all threads calling this
    //!   function (and the srcLane) are executing the same line of code.
    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
    //!
    //! * Commonly used with width = warpsize (the default); then it returns values[srcLane].
    //!
    //! * Width must be a power of 2.
    //!
    //! \tparam TWarp warp implementation type
    //! \tparam T value type
    //! \param warp warp implementation
    //! \param value value to broadcast (only meaningful from threadIdx == srcLane)
    //! \param srcLane source lane sending value
    //! \param width number of threads receiving a single value
    //! \return The value from the thread with index srcLane.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp, typename T>
    ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
    }
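
    // Example (illustrative sketch, not part of the original header): a warp-wide
    // broadcast of a value computed by lane 0. Every lane must execute the call.
    // The helper name is hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC auto broadcastFromLane0(TAcc const& acc, float value) -> float
    //     {
    //         // each lane passes its own value in, all lanes get lane 0's value back
    //         return alpaka::warp::shfl(acc, value, 0);
    //     }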

    //! Exchange data between threads within a warp.
    //! It copies from a lane with a lower ID relative to the caller.
    //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
    //!
    //! Effectively executes:
    //!
    //!     __shared__ int32_t values[warpsize];
    //!     values[threadIdx.x] = value;
    //!     __syncthreads();
    //!     return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
    //!
    //! However, it does not use shared memory.
    //!
    //! Notes:
    //! * The programmer must ensure that all threads calling this
    //!   function (and the source lane) are executing the same line of code.
    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
    //!
    //! * Commonly used with width = warpsize (the default); then it returns values[threadIdx.x - delta]
    //!   if threadIdx.x >= delta.
    //!
    //! * Width must be a power of 2.
    //!
    //! \tparam TWarp warp implementation type
    //! \tparam T value type
    //! \param warp warp implementation
    //! \param value value to exchange
    //! \param offset corresponds to the delta used to compute the lane ID
    //! \param width size of the group participating in the shuffle operation
    //! \return The value from the thread with the computed lane ID.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp, typename T>
    ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
    }
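
    // Example (illustrative sketch, not part of the original header): an inclusive
    // warp-level prefix sum built on shfl_up. Lanes below the current shuffle
    // distance receive their own value back, so they are excluded by the lane
    // index guard. The helper name is hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC auto inclusiveWarpScan(TAcc const& acc, float value) -> float
    //     {
    //         auto const warpSize = static_cast<std::uint32_t>(alpaka::warp::getSize(acc));
    //         auto const lane = static_cast<std::uint32_t>(
    //             alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]) % warpSize;
    //         for(std::uint32_t d = 1; d < warpSize; d *= 2)
    //         {
    //             float const other = alpaka::warp::shfl_up(acc, value, d);
    //             if(lane >= d)
    //                 value += other;
    //         }
    //         return value; // lane i holds the sum of lanes 0..i
    //     }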

    //! Exchange data between threads within a warp.
    //! It copies from a lane with a higher ID relative to the caller.
    //! The lane ID is calculated by adding delta to the caller’s lane ID.
    //!
    //! Effectively executes:
    //!
    //!     __shared__ int32_t values[warpsize];
    //!     values[threadIdx.x] = value;
    //!     __syncthreads();
    //!     return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
    //!
    //! However, it does not use shared memory.
    //!
    //! Notes:
    //! * The programmer must ensure that all threads calling this
    //!   function (and the source lane) are executing the same line of code.
    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
    //!
    //! * Commonly used with width = warpsize (the default); then it returns values[threadIdx.x + delta]
    //!   if threadIdx.x + delta < warpsize.
    //!
    //! * Width must be a power of 2.
    //!
    //! \tparam TWarp warp implementation type
    //! \tparam T value type
    //! \param warp warp implementation
    //! \param value value to exchange
    //! \param offset corresponds to the delta used to compute the lane ID
    //! \param width size of the group participating in the shuffle operation
    //! \return The value from the thread with the computed lane ID.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp, typename T>
    ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
    }
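
    // Example (illustrative sketch, not part of the original header): the classic
    // warp-level sum reduction with shfl_down; after the loop, lane 0 holds the
    // total for the warp. The helper name is hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC auto warpReduceSum(TAcc const& acc, float value) -> float
    //     {
    //         for(auto d = static_cast<std::uint32_t>(alpaka::warp::getSize(acc)) / 2u; d > 0u; d /= 2u)
    //             value += alpaka::warp::shfl_down(acc, value, d);
    //         return value; // only lane 0 holds the full sum
    //     }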

    //! Exchange data between threads within a warp.
    //! It copies from a lane based on a bitwise XOR of the caller's own lane ID.
    //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask.
    //!
    //! Effectively executes:
    //!
    //!     __shared__ int32_t values[warpsize];
    //!     values[threadIdx.x] = value;
    //!     __syncthreads();
    //!     int lane = threadIdx.x ^ mask;
    //!     return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
    //!
    //! However, it does not use shared memory.
    //!
    //! Notes:
    //! * The programmer must ensure that all threads calling this
    //!   function (and the source lane) are executing the same line of code.
    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
    //!
    //! * Commonly used with width = warpsize (the default); then it returns values[threadIdx.x ^ mask].
    //!
    //! * Width must be a power of 2.
    //!
    //! \tparam TWarp warp implementation type
    //! \tparam T value type
    //! \param warp warp implementation
    //! \param value value to exchange
    //! \param mask corresponds to the mask used to compute the lane ID
    //! \param width size of the group participating in the shuffle operation
    //! \return The value from the thread with the computed lane ID.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TWarp, typename T>
    ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
    {
        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
        return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
    }
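
    // Example (illustrative sketch, not part of the original header): a butterfly
    // all-reduce with shfl_xor; unlike the shfl_down variant, every lane ends up
    // holding the warp-wide sum. The helper name is hypothetical.
    //
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC auto warpAllReduceSum(TAcc const& acc, float value) -> float
    //     {
    //         for(std::int32_t m = alpaka::warp::getSize(acc) / 2; m > 0; m /= 2)
    //             value += alpaka::warp::shfl_xor(acc, value, m);
    //         return value; // identical on every active lane of the warp
    //     }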
} // namespace alpaka::warp