alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
TaskKernelGenericSycl.hpp
Go to the documentation of this file.
1/* Copyright 2024 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
10#include "alpaka/core/Sycl.hpp"
11#include "alpaka/dev/Traits.hpp"
12#include "alpaka/dim/Traits.hpp"
13#include "alpaka/idx/Traits.hpp"
21
22#include <cassert>
23#include <functional>
24#include <memory>
25#include <stdexcept>
26#include <tuple>
27#include <type_traits>
28#include <utility>
29
30#ifdef ALPAKA_ACC_SYCL_ENABLED
31
32# if BOOST_COMP_CLANG
33# pragma clang diagnostic push
34# pragma clang diagnostic ignored "-Wunused-lambda-capture"
35# pragma clang diagnostic ignored "-Wunused-parameter"
36# endif
37
38# include <sycl/sycl.hpp>
39
40# define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \
41 cgh.parallel_for( \
42 sycl::nd_range<TDim::value>{global_size, local_size}, \
43 [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
44 sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]] \
45 { \
46 auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
47 std::apply( \
48 [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
49 k_args); \
50 });
51
52# define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE \
53 cgh.parallel_for( \
54 sycl::nd_range<TDim::value>{global_size, local_size}, \
55 [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
56 sycl::nd_item<TDim::value> work_item) \
57 { \
58 auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
59 std::apply( \
60 [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
61 k_args); \
62 });
63
64# define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL \
65 throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); \
66 cgh.parallel_for( \
67 sycl::nd_range<TDim::value>{global_size, local_size}, \
68 [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
69 sycl::nd_item<TDim::value> work_item) {});
70
71namespace alpaka
72{
73 //! The SYCL accelerator execution task.
74 template<concepts::Tag TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
75 class TaskKernelGenericSycl final : public WorkDivMembers<TDim, TIdx>
76 {
77 public:
78 static_assert(TDim::value > 0 && TDim::value <= 3, "Invalid kernel dimensionality");
79
80 template<typename TWorkDiv>
81 TaskKernelGenericSycl(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
82 : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
83 , m_kernelFnObj{kernelFnObj}
84 , m_args{std::forward<TArgs>(args)...}
85 {
86 }
87
88 auto operator()(sycl::handler& cgh) const -> void
89 {
90 auto const work_groups = WorkDivMembers<TDim, TIdx>::m_gridBlockExtent;
91 auto const group_items = WorkDivMembers<TDim, TIdx>::m_blockThreadExtent;
92 auto const item_elements = WorkDivMembers<TDim, TIdx>::m_threadElemExtent;
93
94 auto const global_size = get_global_size(work_groups, group_items);
95 auto const local_size = get_local_size(group_items);
96
97 // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
98 auto const dyn_shared_mem_bytes = std::max(
99 1ul,
100 std::apply(
101 [&](std::decay_t<TArgs> const&... args) {
102 return getBlockSharedMemDynSizeBytes<TAcc>(m_kernelFnObj, group_items, item_elements, args...);
103 },
104 m_args));
105
106 auto dyn_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{dyn_shared_mem_bytes}, cgh};
107
108 // allocate static shared memory -- value comes from the build system
109 constexpr auto st_shared_mem_bytes = std::size_t{ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB * 1024};
110 auto st_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
111
112 // copy-by-value so we don't access 'this' on the device
113 auto k_func = m_kernelFnObj;
114 auto k_args = m_args;
115
116 constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
117 bool supported = false;
118
119 if constexpr(sub_group_size == 0)
120 {
121 // no explicit subgroup size requirement
122 LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
123 supported = true;
124 }
125 else
126 {
127# if(SYCL_SUBGROUP_SIZE == 0)
128 // no explicit SYCL target, assume JIT compilation
129 LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
130 supported = true;
131# else
132 // check if the kernel should be launched with a subgroup size of 4
133 if constexpr(sub_group_size == 4)
134 {
135# if(SYCL_SUBGROUP_SIZE & 4)
136 LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
137 supported = true;
138# else
139 // empty kernel, required to keep SYCL happy
140 THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
141# endif
142 }
143
144 // check if the kernel should be launched with a subgroup size of 8
145 if constexpr(sub_group_size == 8)
146 {
147# if(SYCL_SUBGROUP_SIZE & 8)
148 LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
149 supported = true;
150# else
151 // empty kernel, required to keep SYCL happy
152 THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
153# endif
154 }
155
156 // check if the kernel should be launched with a subgroup size of 16
157 if constexpr(sub_group_size == 16)
158 {
159# if(SYCL_SUBGROUP_SIZE & 16)
160 LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
161 supported = true;
162# else
163 // empty kernel, required to keep SYCL happy
164 THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
165# endif
166 }
167
168 // check if the kernel should be launched with a subgroup size of 32
169 if constexpr(sub_group_size == 32)
170 {
171# if(SYCL_SUBGROUP_SIZE & 32)
172 LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
173 supported = true;
174# else
175 // empty kernel, required to keep SYCL happy
176 THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
177# endif
178 }
179
180 // check if the kernel should be launched with a subgroup size of 64
181 if constexpr(sub_group_size == 64)
182 {
183# if(SYCL_SUBGROUP_SIZE & 64)
184 LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
185 supported = true;
186# else
187 // empty kernel, required to keep SYCL happy
188 THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
189# endif
190 }
191# endif
192
193 // this subgroup size is not supported, raise an exception
194 if(not supported)
195 throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
196 }
197 }
198
199 static constexpr auto is_sycl_task = true;
200 // Distinguish from other tasks
201 static constexpr auto is_sycl_kernel = true;
202
203 private:
204 auto get_global_size(Vec<TDim, TIdx> const& work_groups, Vec<TDim, TIdx> const& group_items) const
205 {
206 if constexpr(TDim::value == 1)
207 return sycl::range<1>{static_cast<std::size_t>(work_groups[0] * group_items[0])};
208 else if constexpr(TDim::value == 2)
209 return sycl::range<2>{
210 static_cast<std::size_t>(work_groups[1] * group_items[1]),
211 static_cast<std::size_t>(work_groups[0] * group_items[0])};
212 else
213 return sycl::range<3>{
214 static_cast<std::size_t>(work_groups[2] * group_items[2]),
215 static_cast<std::size_t>(work_groups[1] * group_items[1]),
216 static_cast<std::size_t>(work_groups[0] * group_items[0])};
217 }
218
219 auto get_local_size(Vec<TDim, TIdx> const& group_items) const
220 {
221 if constexpr(TDim::value == 1)
222 return sycl::range<1>{static_cast<std::size_t>(group_items[0])};
223 else if constexpr(TDim::value == 2)
224 return sycl::range<2>{
225 static_cast<std::size_t>(group_items[1]),
226 static_cast<std::size_t>(group_items[0])};
227 else
228 return sycl::range<3>{
229 static_cast<std::size_t>(group_items[2]),
230 static_cast<std::size_t>(group_items[1]),
231 static_cast<std::size_t>(group_items[0])};
232 }
233
234 public:
235 TKernelFnObj m_kernelFnObj;
236 std::tuple<std::decay_t<TArgs>...> m_args;
237 };
238
239} // namespace alpaka
240
241# if BOOST_COMP_CLANG
242# pragma clang diagnostic pop
243# endif
244
245namespace alpaka::trait
246{
247 //! The SYCL execution task accelerator type trait specialization.
248 template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
249 struct AccType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
250 {
251 using type = TAcc;
252 };
253
254 //! The SYCL execution task device type trait specialization.
255 template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
256 struct DevType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
257 {
258 using type = typename DevType<TAcc>::type;
259 };
260
261 //! The SYCL execution task platform type trait specialization.
262 template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
263 struct PlatformType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
264 {
265 using type = typename PlatformType<TAcc>::type;
266 };
267
268 //! The SYCL execution task dimension getter trait specialization.
269 template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
270 struct DimType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
271 {
272 using type = TDim;
273 };
274
275 //! The SYCL execution task idx type trait specialization.
276 template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
277 struct IdxType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
278 {
279 using type = TIdx;
280 };
281
282 //! \brief Specialisation of the class template FunctionAttributes
283 //! \tparam TTag The SYCL device selector.
284 //! \tparam TDev The device type.
285 //! \tparam TDim The dimensionality of the accelerator device properties.
286 //! \tparam TIdx The idx type of the accelerator device properties.
287 //! \tparam TKernelFn Kernel function object type.
288 //! \tparam TArgs Kernel function object argument types as a parameter pack.
289 template<concepts::Tag TTag, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
290 struct FunctionAttributes<AccGenericSycl<TTag, TDim, TIdx>, TDev, TKernelFn, TArgs...>
291 {
292 //! \param dev The device instance
293 //! \param kernelFn The kernel function object which should be executed.
294 //! \param args The kernel invocation arguments.
295 //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
296 //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
298 TDev const& dev,
299 [[maybe_unused]] TKernelFn const& kernelFn,
300 [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
301 {
302 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
303
304 // set function properties for maxThreadsPerBlock to device properties
305 auto const& props = alpaka::getAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>(dev);
306 kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
307 return kernelFunctionAttributes;
308 }
309 };
310} // namespace alpaka::trait
311
312# undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
313
314#endif
#define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The accelerator traits.
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC WorkDivMembers(alpaka::Vec< TDim, TIdx > const &gridBlockExtent, alpaka::Vec< TDim, TIdx > const &blockThreadExtent, alpaka::Vec< TDim, TIdx > const &elemExtent) -> WorkDivMembers< TDim, TIdx >
Deduction guide for the constructor which can be called without explicit template type parameters.
STL namespace.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, TKernelFnObj const &kernelFn, TArgs &&... args) -> alpaka::KernelFunctionAttributes
Definition Traits.hpp:85