alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelGenericSycl.hpp
Go to the documentation of this file.
1 /* Copyright 2024 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
8 #include "alpaka/acc/Traits.hpp"
10 #include "alpaka/core/Sycl.hpp"
11 #include "alpaka/dev/Traits.hpp"
12 #include "alpaka/dim/Traits.hpp"
13 #include "alpaka/idx/Traits.hpp"
16 #include "alpaka/kernel/Traits.hpp"
19 #include "alpaka/queue/Traits.hpp"
21 
22 #include <cassert>
23 #include <functional>
24 #include <memory>
25 #include <stdexcept>
26 #include <tuple>
27 #include <type_traits>
28 #include <utility>
29 
30 #ifdef ALPAKA_ACC_SYCL_ENABLED
31 
32 # if BOOST_COMP_CLANG
33 # pragma clang diagnostic push
34 # pragma clang diagnostic ignored "-Wunused-lambda-capture"
35 # pragma clang diagnostic ignored "-Wunused-parameter"
36 # endif
37 
38 # include <sycl/sycl.hpp>
39 
40 # define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \
41  cgh.parallel_for( \
42  sycl::nd_range<TDim::value>{global_size, local_size}, \
43  [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
44  sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]] \
45  { \
46  auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
47  std::apply( \
48  [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
49  k_args); \
50  });
51 
52 # define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE \
53  cgh.parallel_for( \
54  sycl::nd_range<TDim::value>{global_size, local_size}, \
55  [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
56  sycl::nd_item<TDim::value> work_item) \
57  { \
58  auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
59  std::apply( \
60  [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
61  k_args); \
62  });
63 
64 # define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL \
65  throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); \
66  cgh.parallel_for( \
67  sycl::nd_range<TDim::value>{global_size, local_size}, \
68  [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
69  sycl::nd_item<TDim::value> work_item) {});
70 
71 namespace alpaka
72 {
73  //! The SYCL accelerator execution task.
74  template<concepts::Tag TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
75  class TaskKernelGenericSycl final : public WorkDivMembers<TDim, TIdx>
76  {
77  public:
78  static_assert(TDim::value > 0 && TDim::value <= 3, "Invalid kernel dimensionality");
79 
80  template<typename TWorkDiv>
81  TaskKernelGenericSycl(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
82  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
83  , m_kernelFnObj{kernelFnObj}
84  , m_args{std::forward<TArgs>(args)...}
85  {
86  }
87 
88  auto operator()(sycl::handler& cgh) const -> void
89  {
90  auto const work_groups = WorkDivMembers<TDim, TIdx>::m_gridBlockExtent;
91  auto const group_items = WorkDivMembers<TDim, TIdx>::m_blockThreadExtent;
92  auto const item_elements = WorkDivMembers<TDim, TIdx>::m_threadElemExtent;
93 
94  auto const global_size = get_global_size(work_groups, group_items);
95  auto const local_size = get_local_size(group_items);
96 
97  // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
98  auto const dyn_shared_mem_bytes = std::max(
99  1ul,
100  std::apply(
101  [&](std::decay_t<TArgs> const&... args) {
102  return getBlockSharedMemDynSizeBytes<TAcc>(m_kernelFnObj, group_items, item_elements, args...);
103  },
104  m_args));
105 
106  auto dyn_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{dyn_shared_mem_bytes}, cgh};
107 
108  // allocate static shared memory -- value comes from the build system
109  constexpr auto st_shared_mem_bytes = std::size_t{ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB * 1024};
110  auto st_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
111 
112  // copy-by-value so we don't access 'this' on the device
113  auto k_func = m_kernelFnObj;
114  auto k_args = m_args;
115 
116  constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
117  bool supported = false;
118 
119  if constexpr(sub_group_size == 0)
120  {
121  // no explicit subgroup size requirement
122  LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
123  supported = true;
124  }
125  else
126  {
127 # if(SYCL_SUBGROUP_SIZE == 0)
128  // no explicit SYCL target, assume JIT compilation
129  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
130  supported = true;
131 # else
132  // check if the kernel should be launched with a subgroup size of 4
133  if constexpr(sub_group_size == 4)
134  {
135 # if(SYCL_SUBGROUP_SIZE & 4)
136  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
137  supported = true;
138 # else
139  // empty kernel, required to keep SYCL happy
140  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
141 # endif
142  }
143 
144  // check if the kernel should be launched with a subgroup size of 8
145  if constexpr(sub_group_size == 8)
146  {
147 # if(SYCL_SUBGROUP_SIZE & 8)
148  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
149  supported = true;
150 # else
151  // empty kernel, required to keep SYCL happy
152  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
153 # endif
154  }
155 
156  // check if the kernel should be launched with a subgroup size of 16
157  if constexpr(sub_group_size == 16)
158  {
159 # if(SYCL_SUBGROUP_SIZE & 16)
160  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
161  supported = true;
162 # else
163  // empty kernel, required to keep SYCL happy
164  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
165 # endif
166  }
167 
168  // check if the kernel should be launched with a subgroup size of 32
169  if constexpr(sub_group_size == 32)
170  {
171 # if(SYCL_SUBGROUP_SIZE & 32)
172  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
173  supported = true;
174 # else
175  // empty kernel, required to keep SYCL happy
176  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
177 # endif
178  }
179 
180  // check if the kernel should be launched with a subgroup size of 64
181  if constexpr(sub_group_size == 64)
182  {
183 # if(SYCL_SUBGROUP_SIZE & 64)
184  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
185  supported = true;
186 # else
187  // empty kernel, required to keep SYCL happy
188  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
189 # endif
190  }
191 # endif
192 
193  // this subgroup size is not supported, raise an exception
194  if(not supported)
195  throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
196  }
197  }
198 
199  static constexpr auto is_sycl_task = true;
200  // Distinguish from other tasks
201  static constexpr auto is_sycl_kernel = true;
202 
203  private:
204  auto get_global_size(Vec<TDim, TIdx> const& work_groups, Vec<TDim, TIdx> const& group_items) const
205  {
206  if constexpr(TDim::value == 1)
207  return sycl::range<1>{static_cast<std::size_t>(work_groups[0] * group_items[0])};
208  else if constexpr(TDim::value == 2)
209  return sycl::range<2>{
210  static_cast<std::size_t>(work_groups[1] * group_items[1]),
211  static_cast<std::size_t>(work_groups[0] * group_items[0])};
212  else
213  return sycl::range<3>{
214  static_cast<std::size_t>(work_groups[2] * group_items[2]),
215  static_cast<std::size_t>(work_groups[1] * group_items[1]),
216  static_cast<std::size_t>(work_groups[0] * group_items[0])};
217  }
218 
219  auto get_local_size(Vec<TDim, TIdx> const& group_items) const
220  {
221  if constexpr(TDim::value == 1)
222  return sycl::range<1>{static_cast<std::size_t>(group_items[0])};
223  else if constexpr(TDim::value == 2)
224  return sycl::range<2>{
225  static_cast<std::size_t>(group_items[1]),
226  static_cast<std::size_t>(group_items[0])};
227  else
228  return sycl::range<3>{
229  static_cast<std::size_t>(group_items[2]),
230  static_cast<std::size_t>(group_items[1]),
231  static_cast<std::size_t>(group_items[0])};
232  }
233 
234  public:
235  TKernelFnObj m_kernelFnObj;
236  std::tuple<std::decay_t<TArgs>...> m_args;
237  };
238 
239 } // namespace alpaka
240 
241 # if BOOST_COMP_CLANG
242 # pragma clang diagnostic pop
243 # endif
244 
245 namespace alpaka::trait
246 {
247  //! The SYCL execution task accelerator type trait specialization.
248  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
249  struct AccType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
250  {
251  using type = TAcc;
252  };
253 
254  //! The SYCL execution task device type trait specialization.
255  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
256  struct DevType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
257  {
258  using type = typename DevType<TAcc>::type;
259  };
260 
261  //! The SYCL execution task platform type trait specialization.
262  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
263  struct PlatformType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
264  {
265  using type = typename PlatformType<TAcc>::type;
266  };
267 
268  //! The SYCL execution task dimension getter trait specialization.
269  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
270  struct DimType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
271  {
272  using type = TDim;
273  };
274 
275  //! The SYCL execution task idx type trait specialization.
276  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
277  struct IdxType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
278  {
279  using type = TIdx;
280  };
281 
282  //! \brief Specialisation of the class template FunctionAttributes
283  //! \tparam TTag The SYCL device selector.
284  //! \tparam TDev The device type.
285  //! \tparam TDim The dimensionality of the accelerator device properties.
286  //! \tparam TIdx The idx type of the accelerator device properties.
287  //! \tparam TKernelFn Kernel function object type.
288  //! \tparam TArgs Kernel function object argument types as a parameter pack.
289  template<concepts::Tag TTag, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
290  struct FunctionAttributes<AccGenericSycl<TTag, TDim, TIdx>, TDev, TKernelFn, TArgs...>
291  {
292  //! \param dev The device instance
293  //! \param kernelFn The kernel function object which should be executed.
294  //! \param args The kernel invocation arguments.
295  //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
296  //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
298  TDev const& dev,
299  [[maybe_unused]] TKernelFn const& kernelFn,
300  [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
301  {
302  alpaka::KernelFunctionAttributes kernelFunctionAttributes;
303 
304  // set function properties for maxThreadsPerBlock to device properties
305  auto const& props = alpaka::getAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>(dev);
306  kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
307  return kernelFunctionAttributes;
308  }
309  };
310 } // namespace alpaka::trait
311 
312 # undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
313 
314 #endif
#define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
Vec< TDim, TIdx > m_blockThreadExtent
Vec< TDim, TIdx > m_gridBlockExtent
Vec< TDim, TIdx > m_threadElemExtent
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
concept Tag
Definition: Tag.hpp:46
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto max(T const &max_ctx, Tx const &x, Ty const &y)
Returns the larger of two arguments. NaNs are treated as missing data (between a NaN and a numeric va...
Definition: Traits.hpp:1263
The accelerator traits.
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC WorkDivMembers(alpaka::Vec< TDim, TIdx > const &gridBlockExtent, alpaka::Vec< TDim, TIdx > const &blockThreadExtent, alpaka::Vec< TDim, TIdx > const &elemExtent) -> WorkDivMembers< TDim, TIdx >
Deduction guide for the constructor which can be called without explicit template type parameters.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
static ALPAKA_FN_HOST auto getFunctionAttributes([[maybe_unused]] TDev const &dev, [[maybe_unused]] TKernelFnObj const &kernelFn, [[maybe_unused]] TArgs &&... args) -> alpaka::KernelFunctionAttributes
Definition: Traits.hpp:85