alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WarpGenericSycl.hpp
Go to the documentation of this file.
1/* Copyright 2026 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego, Simone Balducci
2 * SPDX-License-Identifier: MPL-2.0
3 *
4 * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
5 * from Intel DPCT.
6 * Copyright (C) Intel Corporation.
7 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 * See https://llvm.org/LICENSE.txt for license information.
9 */
10
11#pragma once
12
15
16#include <cstdint>
17
18#ifdef ALPAKA_ACC_SYCL_ENABLED
19
20# include <sycl/sycl.hpp>
21
22namespace alpaka::warp
23{
24 //! The SYCL warp.
25 template<typename TDim>
26 class WarpGenericSycl : public interface::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
27 {
28 public:
29 using mask_type = std::uint32_t;
30
31 WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
32 {
33 }
34
35 sycl::nd_item<TDim::value> m_item_warp;
36 };
37} // namespace alpaka::warp
38
39namespace alpaka::warp::trait
40{
41
42 template<typename TDim>
43 struct GetSize<warp::WarpGenericSycl<TDim>>
44 {
45 static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
46 {
47 auto const sub_group = warp.m_item_warp.get_sub_group();
48 // SYCL sub-groups are always 1D
49 return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
50 }
51 };
52
53 template<typename TDim>
54 struct GetSizeCompileTime<warp::WarpGenericSycl<TDim>>
55 {
56 static constexpr auto getSizeCompileTime() -> std::int32_t
57 {
58 // SYCL sub-groups size is usually not known at compile time
59 return 0;
60 }
61 };
62
63 template<typename TDim>
64 struct GetSizeUpperLimit<warp::WarpGenericSycl<TDim>>
65 {
66 static constexpr auto getSizeUpperLimit() -> std::int32_t
67 {
68 // See include/alpaka/kernel/SyclSubgroupSize.hpp for possible sub-group sizes.
69 return 64;
70 }
71 };
72
73 template<typename TDim>
74 struct Activemask<warp::WarpGenericSycl<TDim>>
75 {
76 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
77 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
78 // Restrict to warpSize <= 32 for now.
79 static auto activemask(warp::WarpGenericSycl<TDim> const& /*warp*/) -> warp::WarpGenericSycl<TDim>::mask_type
80 {
81 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
82 auto const mask = sycl::ext::oneapi::group_ballot(sg, true);
83 std::uint32_t bits = 0;
84 mask.extract_bits(bits);
85 return bits;
86 }
87 };
88
89 template<typename TDim>
90 struct All<warp::WarpGenericSycl<TDim>>
91 {
92 static auto all(warp::WarpGenericSycl<TDim> const& /*warp*/, std::int32_t predicate) -> std::int32_t
93 {
94 auto activegroup = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
95 return static_cast<std::int32_t>(sycl::all_of_group(activegroup, static_cast<bool>(predicate)));
96 }
97 };
98
99 template<typename TDim>
100 struct Any<warp::WarpGenericSycl<TDim>>
101 {
102 static auto any(warp::WarpGenericSycl<TDim> const& /*warp*/, std::int32_t predicate) -> std::int32_t
103 {
104 auto activegroup = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
105 return static_cast<std::int32_t>(sycl::any_of_group(activegroup, static_cast<bool>(predicate)));
106 }
107 };
108
109 template<typename TDim>
110 struct Ballot<warp::WarpGenericSycl<TDim>>
111 {
112 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
113 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
114 // Restrict to warpSize <= 32 for now.
115 static auto ballot(warp::WarpGenericSycl<TDim> const& /*warp*/, std::int32_t predicate)
116 -> warp::WarpGenericSycl<TDim>::mask_type
117 {
118 auto sub_group = sycl::ext::oneapi::this_work_item::get_sub_group();
119 auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
120 // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
121 // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
122 // Restrict to warpSize <= 32 for now.
123 std::uint32_t bits = 0;
124 mask.extract_bits(bits);
125 return bits;
126 }
127 };
128
129 template<typename TDim>
130 struct Shfl<warp::WarpGenericSycl<TDim>>
131 {
132 template<typename T>
133 static auto shfl(
134 warp::WarpGenericSycl<TDim> const& /*warp*/,
135 T value,
136 std::int32_t srcLane,
137 std::int32_t width)
138 {
139 ALPAKA_ASSERT_ACC(width > 0);
140 ALPAKA_ASSERT_ACC(srcLane >= 0);
141
142 /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
143 subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
144
145 Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
146 The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
147 first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
148 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
149 std::uint32_t const w = static_cast<std::uint32_t>(width);
150 std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
151 return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
152 }
153 };
154
155 template<typename TDim>
156 struct ShflUp<warp::WarpGenericSycl<TDim>>
157 {
158 template<typename T>
159 static auto shfl_up(
160 warp::WarpGenericSycl<TDim> const& /*warp*/,
161 T value,
162 std::uint32_t offset, /* must be the same for all work-items in the group */
163 std::int32_t width)
164 {
165 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
166 std::uint32_t const w = static_cast<std::uint32_t>(width);
167 std::uint32_t const id = actual_group.get_local_linear_id();
168 std::uint32_t const start_index = id / w * w;
169 T result = sycl::shift_group_right(actual_group, value, offset);
170 if((id - start_index) < offset)
171 {
172 result = value;
173 }
174 return result;
175 }
176 };
177
178 template<typename TDim>
179 struct ShflDown<warp::WarpGenericSycl<TDim>>
180 {
181 template<typename T>
182 static auto shfl_down(
183 warp::WarpGenericSycl<TDim> const& /*warp*/,
184 T value,
185 std::uint32_t offset,
186 std::int32_t width)
187 {
188 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
189 std::uint32_t const w = static_cast<std::uint32_t>(width);
190 std::uint32_t const id = actual_group.get_local_linear_id();
191 std::uint32_t const end_index = (id / w + 1) * w;
192 T result = sycl::shift_group_left(actual_group, value, offset);
193 if((id + offset) >= end_index)
194 {
195 result = value;
196 }
197 return result;
198 }
199 };
200
201 template<typename TDim>
202 struct ShflXor<warp::WarpGenericSycl<TDim>>
203 {
204 template<typename T>
205 static auto shfl_xor(
206 warp::WarpGenericSycl<TDim> const& /*warp*/,
207 T value,
208 std::int32_t mask,
209 std::int32_t width)
210 {
211 auto actual_group = sycl::ext::oneapi::experimental::this_kernel::get_opportunistic_group();
212 std::uint32_t const w = static_cast<std::uint32_t>(width);
213 std::uint32_t const id = actual_group.get_local_linear_id();
214 std::uint32_t const start_index = id / w * w;
215 std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
216 return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
217 }
218 };
219} // namespace alpaka::warp::trait
220
221#endif
#define ALPAKA_ASSERT_ACC(...)
ALPAKA_ASSERT_ACC is an assert-like macro.
Definition Assert.hpp:52
The warp traits.
Definition Traits.hpp:21
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate) -> typename TWarp::mask_type
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition Traits.hpp:194
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:144
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition Traits.hpp:266
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
Definition Traits.hpp:96
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition Traits.hpp:167
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition Traits.hpp:304
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition Traits.hpp:73
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition Traits.hpp:342
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> typename TWarp::mask_type
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
Definition Traits.hpp:84
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition Traits.hpp:228