alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WarpGenericSycl.hpp
Go to the documentation of this file.
1/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
2 * SPDX-License-Identifier: MPL-2.0
3 *
4 * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
5 * from Intel DPCT.
6 * Copyright (C) Intel Corporation.
7 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 * See https://llvm.org/LICENSE.txt for license information.
9 */
10
11#pragma once
12
15
16#include <cstdint>
17
18#ifdef ALPAKA_ACC_SYCL_ENABLED
19
20# include <sycl/sycl.hpp>
21
22namespace alpaka::warp
23{
24 //! The SYCL warp.
25 template<typename TDim>
26 class WarpGenericSycl : public interface::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
27 {
28 public:
29 WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
30 {
31 }
32
33 sycl::nd_item<TDim::value> m_item_warp;
34 };
35} // namespace alpaka::warp
36
37namespace alpaka::warp::trait
38{
39 template<typename TDim>
40 struct GetSize<warp::WarpGenericSycl<TDim>>
41 {
42 static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
43 {
44 auto const sub_group = warp.m_item_warp.get_sub_group();
45 // SYCL sub-groups are always 1D
46 return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
47 }
48 };
49
50 template<typename TDim>
51 struct Activemask<warp::WarpGenericSycl<TDim>>
52 {
53 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
54 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
55 // Restrict to warpSize <= 32 for now.
56 static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
57 {
58 static_assert(!sizeof(warp), "activemask is not supported on SYCL");
59 // SYCL does not have an API to get the activemask. It is also questionable (to me, bgruber) whether an
60 // "activemask" even exists on some hardware architectures, since the idea is bound to threads being
61 // "turned off" when they take different control flow in a warp. A SYCL implementation could run each
62 // thread as a SIMD lane, in which cause the "thread" is always active, but some SIMD lanes are either
63 // predicated off, or side-effects are masked out when writing them back.
64 //
65 // An implementation via oneAPI's sycl::ext::oneapi::group_ballot causes UB, because activemask is expected
66 // to be callable when less than all threads are active in a warp (CUDA). But SYCL requires all threads of
67 // a group to call the function.
68 //
69 // Intel's CUDA -> SYCL migration tool also suggests that there is no direct equivalent and the user must
70 // rewrite their kernel logic. See also:
71 // https://oneapi-src.github.io/SYCLomatic/dev_guide/diagnostic_ref/dpct1086.html
72
73 return ~std::uint32_t{0};
74 }
75 };
76
77 template<typename TDim>
78 struct All<warp::WarpGenericSycl<TDim>>
79 {
80 static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
81 {
82 auto const sub_group = warp.m_item_warp.get_sub_group();
83 return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
84 }
85 };
86
87 template<typename TDim>
88 struct Any<warp::WarpGenericSycl<TDim>>
89 {
90 static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
91 {
92 auto const sub_group = warp.m_item_warp.get_sub_group();
93 return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
94 }
95 };
96
97 template<typename TDim>
98 struct Ballot<warp::WarpGenericSycl<TDim>>
99 {
100 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
101 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
102 // Restrict to warpSize <= 32 for now.
103 static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
104 {
105 auto const sub_group = warp.m_item_warp.get_sub_group();
106 auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
107 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
108 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
109 // Restrict to warpSize <= 32 for now.
110 std::uint32_t bits = 0;
111 mask.extract_bits(bits);
112 return bits;
113 }
114 };
115
116 template<typename TDim>
117 struct Shfl<warp::WarpGenericSycl<TDim>>
118 {
119 template<typename T>
120 static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
121 {
122 ALPAKA_ASSERT_ACC(width > 0);
123 ALPAKA_ASSERT_ACC(srcLane >= 0);
124
125 /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
126 subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
127
128 Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
129 The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
130 first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
131 auto const actual_group = warp.m_item_warp.get_sub_group();
132 std::uint32_t const w = static_cast<std::uint32_t>(width);
133 std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
134 return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
135 }
136 };
137
138 template<typename TDim>
139 struct ShflUp<warp::WarpGenericSycl<TDim>>
140 {
141 template<typename T>
142 static auto shfl_up(
143 warp::WarpGenericSycl<TDim> const& warp,
144 T value,
145 std::uint32_t offset, /* must be the same for all work-items in the group */
146 std::int32_t width)
147 {
148 auto const actual_group = warp.m_item_warp.get_sub_group();
149 std::uint32_t const w = static_cast<std::uint32_t>(width);
150 std::uint32_t const id = actual_group.get_local_linear_id();
151 std::uint32_t const start_index = id / w * w;
152 T result = sycl::shift_group_right(actual_group, value, offset);
153 if((id - start_index) < offset)
154 {
155 result = value;
156 }
157 return result;
158 }
159 };
160
161 template<typename TDim>
162 struct ShflDown<warp::WarpGenericSycl<TDim>>
163 {
164 template<typename T>
165 static auto shfl_down(
166 warp::WarpGenericSycl<TDim> const& warp,
167 T value,
168 std::uint32_t offset,
169 std::int32_t width)
170 {
171 auto const actual_group = warp.m_item_warp.get_sub_group();
172 std::uint32_t const w = static_cast<std::uint32_t>(width);
173 std::uint32_t const id = actual_group.get_local_linear_id();
174 std::uint32_t const end_index = (id / w + 1) * w;
175 T result = sycl::shift_group_left(actual_group, value, offset);
176 if((id + offset) >= end_index)
177 {
178 result = value;
179 }
180 return result;
181 }
182 };
183
184 template<typename TDim>
185 struct ShflXor<warp::WarpGenericSycl<TDim>>
186 {
187 template<typename T>
188 static auto shfl_xor(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t mask, std::int32_t width)
189 {
190 auto const actual_group = warp.m_item_warp.get_sub_group();
191 std::uint32_t const w = static_cast<std::uint32_t>(width);
192 std::uint32_t const id = actual_group.get_local_linear_id();
193 std::uint32_t const start_index = id / w * w;
194 std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
195 return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
196 }
197 };
198} // namespace alpaka::warp::trait
199
200#endif
#define ALPAKA_ASSERT_ACC(...)
ALPAKA_ASSERT_ACC is an assert-like macro.
Definition Assert.hpp:52
The warp traits.
Definition Traits.hpp:21
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:114
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:164
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:236
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:90
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:137
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:274
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:65
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:312
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:198
STL namespace.