alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WarpGenericSycl.hpp
Go to the documentation of this file.
1/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
2 * SPDX-License-Identifier: MPL-2.0
3 *
4 * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
5 * from Intel DPCT.
6 * Copyright (C) Intel Corporation.
7 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 * See https://llvm.org/LICENSE.txt for license information.
9 */
10
11#pragma once
12
15
16#include <cstdint>
17
18#ifdef ALPAKA_ACC_SYCL_ENABLED
19
20# include <sycl/sycl.hpp>
21
22namespace alpaka::warp
23{
24 //! The SYCL warp.
25 template<typename TDim>
26 class WarpGenericSycl : public interface::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
27 {
28 public:
29 WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
30 {
31 }
32
33 sycl::nd_item<TDim::value> m_item_warp;
34 };
35} // namespace alpaka::warp
36
37namespace alpaka::warp::trait
38{
39 template<typename TDim>
40 struct GetSize<warp::WarpGenericSycl<TDim>>
41 {
42 static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
43 {
44 auto const sub_group = warp.m_item_warp.get_sub_group();
45 // SYCL sub-groups are always 1D
46 return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
47 }
48 };
49
50 template<typename TDim>
51 struct GetSizeCompileTime<warp::WarpGenericSycl<TDim>>
52 {
53 static constexpr auto getSizeCompileTime() -> std::int32_t
54 {
55 // SYCL sub-groups size is usually not known at compile time
56 return 0;
57 }
58 };
59
60 template<typename TDim>
61 struct GetSizeUpperLimit<warp::WarpGenericSycl<TDim>>
62 {
63 static constexpr auto getSizeUpperLimit() -> std::int32_t
64 {
65 // See include/alpaka/kernel/SyclSubgroupSize.hpp for possible sub-group sizes.
66 return 64;
67 }
68 };
69
70 template<typename TDim>
71 struct Activemask<warp::WarpGenericSycl<TDim>>
72 {
73 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
74 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
75 // Restrict to warpSize <= 32 for now.
76 static auto activemask(warp::WarpGenericSycl<TDim> const& /*warp*/) -> std::uint32_t
77 {
78 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
79 auto const mask = sycl::ext::oneapi::group_ballot(sg, true);
80 std::uint32_t bits = 0;
81 mask.extract_bits(bits);
82 return bits;
83 }
84 };
85
86 template<typename TDim>
87 struct All<warp::WarpGenericSycl<TDim>>
88 {
89 static auto all(warp::WarpGenericSycl<TDim> const& /*warp*/, std::int32_t predicate) -> std::int32_t
90 {
91 auto activegroup = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
92 return static_cast<std::int32_t>(sycl::all_of_group(activegroup, static_cast<bool>(predicate)));
93 }
94 };
95
96 template<typename TDim>
97 struct Any<warp::WarpGenericSycl<TDim>>
98 {
99 static auto any(warp::WarpGenericSycl<TDim> const& /*warp*/, std::int32_t predicate) -> std::int32_t
100 {
101 auto activegroup = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
102 return static_cast<std::int32_t>(sycl::any_of_group(activegroup, static_cast<bool>(predicate)));
103 }
104 };
105
106 template<typename TDim>
107 struct Ballot<warp::WarpGenericSycl<TDim>>
108 {
109 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
110 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
111 // Restrict to warpSize <= 32 for now.
112 static auto ballot(warp::WarpGenericSycl<TDim> const& /*warp*/, std::int32_t predicate) -> std::uint32_t
113 {
114 auto sub_group = sycl::ext::oneapi::this_work_item::get_sub_group();
115 auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
116 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
117 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
118 // Restrict to warpSize <= 32 for now.
119 std::uint32_t bits = 0;
120 mask.extract_bits(bits);
121 return bits;
122 }
123 };
124
125 template<typename TDim>
126 struct Shfl<warp::WarpGenericSycl<TDim>>
127 {
128 template<typename T>
129 static auto shfl(
130 warp::WarpGenericSycl<TDim> const& /*warp*/,
131 T value,
132 std::int32_t srcLane,
133 std::int32_t width)
134 {
135 ALPAKA_ASSERT_ACC(width > 0);
136 ALPAKA_ASSERT_ACC(srcLane >= 0);
137
138 /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
139 subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
140
141 Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
142 The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
143 first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
144 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
145 std::uint32_t const w = static_cast<std::uint32_t>(width);
146 std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
147 return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
148 }
149 };
150
151 template<typename TDim>
152 struct ShflUp<warp::WarpGenericSycl<TDim>>
153 {
154 template<typename T>
155 static auto shfl_up(
156 warp::WarpGenericSycl<TDim> const& /*warp*/,
157 T value,
158 std::uint32_t offset, /* must be the same for all work-items in the group */
159 std::int32_t width)
160 {
161 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
162 std::uint32_t const w = static_cast<std::uint32_t>(width);
163 std::uint32_t const id = actual_group.get_local_linear_id();
164 std::uint32_t const start_index = id / w * w;
165 T result = sycl::shift_group_right(actual_group, value, offset);
166 if((id - start_index) < offset)
167 {
168 result = value;
169 }
170 return result;
171 }
172 };
173
174 template<typename TDim>
175 struct ShflDown<warp::WarpGenericSycl<TDim>>
176 {
177 template<typename T>
178 static auto shfl_down(
179 warp::WarpGenericSycl<TDim> const& /*warp*/,
180 T value,
181 std::uint32_t offset,
182 std::int32_t width)
183 {
184 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
185 std::uint32_t const w = static_cast<std::uint32_t>(width);
186 std::uint32_t const id = actual_group.get_local_linear_id();
187 std::uint32_t const end_index = (id / w + 1) * w;
188 T result = sycl::shift_group_left(actual_group, value, offset);
189 if((id + offset) >= end_index)
190 {
191 result = value;
192 }
193 return result;
194 }
195 };
196
197 template<typename TDim>
198 struct ShflXor<warp::WarpGenericSycl<TDim>>
199 {
200 template<typename T>
201 static auto shfl_xor(
202 warp::WarpGenericSycl<TDim> const& /*warp*/,
203 T value,
204 std::int32_t mask,
205 std::int32_t width)
206 {
207 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
208 std::uint32_t const w = static_cast<std::uint32_t>(width);
209 std::uint32_t const id = actual_group.get_local_linear_id();
210 std::uint32_t const start_index = id / w * w;
211 std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
212 return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
213 }
214 };
215} // namespace alpaka::warp::trait
216
217#endif
#define ALPAKA_ASSERT_ACC(...)
ALPAKA_ASSERT_ACC is an assert-like macro.
Definition Assert.hpp:52
The warp traits.
Definition Traits.hpp:21
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:145
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:195
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:267
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
Definition Traits.hpp:96
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:168
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:305
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:73
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:343
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
Definition Traits.hpp:84
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:229