alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WarpGenericSycl.hpp
Go to the documentation of this file.
1/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
2 * SPDX-License-Identifier: MPL-2.0
3 *
4 * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
5 * from Intel DPCT.
6 * Copyright (C) Intel Corporation.
7 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 * See https://llvm.org/LICENSE.txt for license information.
9 */
10
11#pragma once
12
15
16#include <cstdint>
17
18#ifdef ALPAKA_ACC_SYCL_ENABLED
19
20# include <sycl/sycl.hpp>
21
22namespace alpaka::warp
23{
24 //! The SYCL warp.
25 template<typename TDim>
26 class WarpGenericSycl : public interface::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
27 {
28 public:
29 WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
30 {
31 }
32
33 sycl::nd_item<TDim::value> m_item_warp;
34 };
35} // namespace alpaka::warp
36
37namespace alpaka::warp::trait
38{
39 template<typename TDim>
40 struct GetSize<warp::WarpGenericSycl<TDim>>
41 {
42 static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
43 {
44 auto const sub_group = warp.m_item_warp.get_sub_group();
45 // SYCL sub-groups are always 1D
46 return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
47 }
48 };
49
50 template<typename TDim>
51 struct GetSizeCompileTime<warp::WarpGenericSycl<TDim>>
52 {
53 static constexpr auto getSizeCompileTime() -> std::int32_t
54 {
55 // SYCL sub-groups size is usually not known at compile time
56 return 0;
57 }
58 };
59
60 template<typename TDim>
61 struct GetSizeUpperLimit<warp::WarpGenericSycl<TDim>>
62 {
63 static constexpr auto getSizeUpperLimit() -> std::int32_t
64 {
65 // See include/alpaka/kernel/SyclSubgroupSize.hpp for possible sub-group sizes.
66 return 64;
67 }
68 };
69
70 template<typename TDim>
71 struct Activemask<warp::WarpGenericSycl<TDim>>
72 {
73 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
74 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
75 // Restrict to warpSize <= 32 for now.
76 static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
77 {
78 static_assert(!sizeof(warp), "activemask is not supported on SYCL");
79 // SYCL does not have an API to get the activemask. It is also questionable (to me, bgruber) whether an
80 // "activemask" even exists on some hardware architectures, since the idea is bound to threads being
81 // "turned off" when they take different control flow in a warp. A SYCL implementation could run each
82 // thread as a SIMD lane, in which cause the "thread" is always active, but some SIMD lanes are either
83 // predicated off, or side-effects are masked out when writing them back.
84 //
85 // An implementation via oneAPI's sycl::ext::oneapi::group_ballot causes UB, because activemask is expected
86 // to be callable when less than all threads are active in a warp (CUDA). But SYCL requires all threads of
87 // a group to call the function.
88 //
89 // Intel's CUDA -> SYCL migration tool also suggests that there is no direct equivalent and the user must
90 // rewrite their kernel logic. See also:
91 // https://oneapi-src.github.io/SYCLomatic/dev_guide/diagnostic_ref/dpct1086.html
92
93 return ~std::uint32_t{0};
94 }
95 };
96
97 template<typename TDim>
98 struct All<warp::WarpGenericSycl<TDim>>
99 {
100 static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
101 {
102 auto const sub_group = warp.m_item_warp.get_sub_group();
103 return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
104 }
105 };
106
107 template<typename TDim>
108 struct Any<warp::WarpGenericSycl<TDim>>
109 {
110 static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
111 {
112 auto const sub_group = warp.m_item_warp.get_sub_group();
113 return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
114 }
115 };
116
117 template<typename TDim>
118 struct Ballot<warp::WarpGenericSycl<TDim>>
119 {
120 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
121 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
122 // Restrict to warpSize <= 32 for now.
123 static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
124 {
125 auto const sub_group = warp.m_item_warp.get_sub_group();
126 auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
127 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
128 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
129 // Restrict to warpSize <= 32 for now.
130 std::uint32_t bits = 0;
131 mask.extract_bits(bits);
132 return bits;
133 }
134 };
135
136 template<typename TDim>
137 struct Shfl<warp::WarpGenericSycl<TDim>>
138 {
139 template<typename T>
140 static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
141 {
142 ALPAKA_ASSERT_ACC(width > 0);
143 ALPAKA_ASSERT_ACC(srcLane >= 0);
144
145 /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
146 subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
147
148 Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
149 The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
150 first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
151 auto const actual_group = warp.m_item_warp.get_sub_group();
152 std::uint32_t const w = static_cast<std::uint32_t>(width);
153 std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
154 return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
155 }
156 };
157
158 template<typename TDim>
159 struct ShflUp<warp::WarpGenericSycl<TDim>>
160 {
161 template<typename T>
162 static auto shfl_up(
163 warp::WarpGenericSycl<TDim> const& warp,
164 T value,
165 std::uint32_t offset, /* must be the same for all work-items in the group */
166 std::int32_t width)
167 {
168 auto const actual_group = warp.m_item_warp.get_sub_group();
169 std::uint32_t const w = static_cast<std::uint32_t>(width);
170 std::uint32_t const id = actual_group.get_local_linear_id();
171 std::uint32_t const start_index = id / w * w;
172 T result = sycl::shift_group_right(actual_group, value, offset);
173 if((id - start_index) < offset)
174 {
175 result = value;
176 }
177 return result;
178 }
179 };
180
181 template<typename TDim>
182 struct ShflDown<warp::WarpGenericSycl<TDim>>
183 {
184 template<typename T>
185 static auto shfl_down(
186 warp::WarpGenericSycl<TDim> const& warp,
187 T value,
188 std::uint32_t offset,
189 std::int32_t width)
190 {
191 auto const actual_group = warp.m_item_warp.get_sub_group();
192 std::uint32_t const w = static_cast<std::uint32_t>(width);
193 std::uint32_t const id = actual_group.get_local_linear_id();
194 std::uint32_t const end_index = (id / w + 1) * w;
195 T result = sycl::shift_group_left(actual_group, value, offset);
196 if((id + offset) >= end_index)
197 {
198 result = value;
199 }
200 return result;
201 }
202 };
203
204 template<typename TDim>
205 struct ShflXor<warp::WarpGenericSycl<TDim>>
206 {
207 template<typename T>
208 static auto shfl_xor(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t mask, std::int32_t width)
209 {
210 auto const actual_group = warp.m_item_warp.get_sub_group();
211 std::uint32_t const w = static_cast<std::uint32_t>(width);
212 std::uint32_t const id = actual_group.get_local_linear_id();
213 std::uint32_t const start_index = id / w * w;
214 std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
215 return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
216 }
217 };
218} // namespace alpaka::warp::trait
219
220#endif
#define ALPAKA_ASSERT_ACC(...)
ALPAKA_ASSERT_ACC is an assert-like macro.
Definition Assert.hpp:52
The warp traits.
Definition Traits.hpp:21
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:145
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:195
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:267
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
Definition Traits.hpp:96
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:168
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:305
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:73
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:343
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
Definition Traits.hpp:84
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:229
STL namespace.