alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelGenericSycl.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 #include "alpaka/acc/Traits.hpp"
10 #include "alpaka/core/Sycl.hpp"
11 #include "alpaka/dev/Traits.hpp"
12 #include "alpaka/dim/Traits.hpp"
13 #include "alpaka/idx/Traits.hpp"
15 #include "alpaka/kernel/Traits.hpp"
17 #include "alpaka/queue/Traits.hpp"
19 
20 #include <cassert>
21 #include <functional>
22 #include <memory>
23 #include <stdexcept>
24 #include <tuple>
25 #include <type_traits>
26 #include <utility>
27 
28 #ifdef ALPAKA_ACC_SYCL_ENABLED
29 
30 # if BOOST_COMP_CLANG
31 # pragma clang diagnostic push
32 # pragma clang diagnostic ignored "-Wunused-lambda-capture"
33 # pragma clang diagnostic ignored "-Wunused-parameter"
34 # endif
35 
36 # include <sycl/sycl.hpp>
37 
38 # define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \
39  cgh.parallel_for( \
40  sycl::nd_range<TDim::value>{global_size, local_size}, \
41  [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
42  sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]] \
43  { \
44  auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
45  std::apply( \
46  [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
47  k_args); \
48  });
49 
50 # define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE \
51  cgh.parallel_for( \
52  sycl::nd_range<TDim::value>{global_size, local_size}, \
53  [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
54  sycl::nd_item<TDim::value> work_item) \
55  { \
56  auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
57  std::apply( \
58  [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
59  k_args); \
60  });
61 
62 # define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL \
63  throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); \
64  cgh.parallel_for( \
65  sycl::nd_range<TDim::value>{global_size, local_size}, \
66  [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
67  sycl::nd_item<TDim::value> work_item) {});
68 
69 namespace alpaka
70 {
71  //! The SYCL accelerator execution task.
72  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
73  class TaskKernelGenericSycl final : public WorkDivMembers<TDim, TIdx>
74  {
75  public:
76  static_assert(TDim::value > 0 && TDim::value <= 3, "Invalid kernel dimensionality");
77 
78  template<typename TWorkDiv>
79  TaskKernelGenericSycl(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
80  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
81  , m_kernelFnObj{kernelFnObj}
82  , m_args{std::forward<TArgs>(args)...}
83  {
84  }
85 
86  auto operator()(sycl::handler& cgh) const -> void
87  {
88  auto const work_groups = WorkDivMembers<TDim, TIdx>::m_gridBlockExtent;
89  auto const group_items = WorkDivMembers<TDim, TIdx>::m_blockThreadExtent;
90  auto const item_elements = WorkDivMembers<TDim, TIdx>::m_threadElemExtent;
91 
92  auto const global_size = get_global_size(work_groups, group_items);
93  auto const local_size = get_local_size(group_items);
94 
95  // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
96  auto const dyn_shared_mem_bytes = std::max(
97  1ul,
98  std::apply(
99  [&](std::decay_t<TArgs> const&... args) {
100  return getBlockSharedMemDynSizeBytes<TAcc>(m_kernelFnObj, group_items, item_elements, args...);
101  },
102  m_args));
103 
104  auto dyn_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{dyn_shared_mem_bytes}, cgh};
105 
106  // allocate static shared memory -- value comes from the build system
107  constexpr auto st_shared_mem_bytes = std::size_t{ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB * 1024};
108  auto st_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
109 
110  // copy-by-value so we don't access 'this' on the device
111  auto k_func = m_kernelFnObj;
112  auto k_args = m_args;
113 
114  constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
115  bool supported = false;
116 
117  if constexpr(sub_group_size == 0)
118  {
119  // no explicit subgroup size requirement
120  LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
121  supported = true;
122  }
123  else
124  {
125 # if(SYCL_SUBGROUP_SIZE == 0)
126  // no explicit SYCL target, assume JIT compilation
127  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
128  supported = true;
129 # else
130  // check if the kernel should be launched with a subgroup size of 4
131  if constexpr(sub_group_size == 4)
132  {
133 # if(SYCL_SUBGROUP_SIZE & 4)
134  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
135  supported = true;
136 # else
137  // empty kernel, required to keep SYCL happy
138  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
139 # endif
140  }
141 
142  // check if the kernel should be launched with a subgroup size of 8
143  if constexpr(sub_group_size == 8)
144  {
145 # if(SYCL_SUBGROUP_SIZE & 8)
146  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
147  supported = true;
148 # else
149  // empty kernel, required to keep SYCL happy
150  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
151 # endif
152  }
153 
154  // check if the kernel should be launched with a subgroup size of 16
155  if constexpr(sub_group_size == 16)
156  {
157 # if(SYCL_SUBGROUP_SIZE & 16)
158  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
159  supported = true;
160 # else
161  // empty kernel, required to keep SYCL happy
162  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
163 # endif
164  }
165 
166  // check if the kernel should be launched with a subgroup size of 32
167  if constexpr(sub_group_size == 32)
168  {
169 # if(SYCL_SUBGROUP_SIZE & 32)
170  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
171  supported = true;
172 # else
173  // empty kernel, required to keep SYCL happy
174  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
175 # endif
176  }
177 
178  // check if the kernel should be launched with a subgroup size of 64
179  if constexpr(sub_group_size == 64)
180  {
181 # if(SYCL_SUBGROUP_SIZE & 64)
182  LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
183  supported = true;
184 # else
185  // empty kernel, required to keep SYCL happy
186  THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
187 # endif
188  }
189 # endif
190 
191  // this subgroup size is not supported, raise an exception
192  if(not supported)
193  throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
194  }
195  }
196 
197  static constexpr auto is_sycl_task = true;
198  // Distinguish from other tasks
199  static constexpr auto is_sycl_kernel = true;
200 
201  private:
202  auto get_global_size(Vec<TDim, TIdx> const& work_groups, Vec<TDim, TIdx> const& group_items) const
203  {
204  if constexpr(TDim::value == 1)
205  return sycl::range<1>{static_cast<std::size_t>(work_groups[0] * group_items[0])};
206  else if constexpr(TDim::value == 2)
207  return sycl::range<2>{
208  static_cast<std::size_t>(work_groups[1] * group_items[1]),
209  static_cast<std::size_t>(work_groups[0] * group_items[0])};
210  else
211  return sycl::range<3>{
212  static_cast<std::size_t>(work_groups[2] * group_items[2]),
213  static_cast<std::size_t>(work_groups[1] * group_items[1]),
214  static_cast<std::size_t>(work_groups[0] * group_items[0])};
215  }
216 
217  auto get_local_size(Vec<TDim, TIdx> const& group_items) const
218  {
219  if constexpr(TDim::value == 1)
220  return sycl::range<1>{static_cast<std::size_t>(group_items[0])};
221  else if constexpr(TDim::value == 2)
222  return sycl::range<2>{
223  static_cast<std::size_t>(group_items[1]),
224  static_cast<std::size_t>(group_items[0])};
225  else
226  return sycl::range<3>{
227  static_cast<std::size_t>(group_items[2]),
228  static_cast<std::size_t>(group_items[1]),
229  static_cast<std::size_t>(group_items[0])};
230  }
231 
232  public:
233  TKernelFnObj m_kernelFnObj;
234  std::tuple<std::decay_t<TArgs>...> m_args;
235  };
236 
237 } // namespace alpaka
238 
239 # if BOOST_COMP_CLANG
240 # pragma clang diagnostic pop
241 # endif
242 
243 namespace alpaka::trait
244 {
245  //! The SYCL execution task accelerator type trait specialization.
246  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
247  struct AccType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
248  {
249  using type = TAcc;
250  };
251 
252  //! The SYCL execution task device type trait specialization.
253  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
254  struct DevType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
255  {
256  using type = typename DevType<TAcc>::type;
257  };
258 
259  //! The SYCL execution task platform type trait specialization.
260  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
261  struct PlatformType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
262  {
263  using type = typename PlatformType<TAcc>::type;
264  };
265 
266  //! The SYCL execution task dimension getter trait specialization.
267  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
268  struct DimType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
269  {
270  using type = TDim;
271  };
272 
273  //! The SYCL execution task idx type trait specialization.
274  template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
275  struct IdxType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
276  {
277  using type = TIdx;
278  };
279 } // namespace alpaka::trait
280 
281 # undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
282 
283 #endif
#define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
Vec< TDim, TIdx > m_blockThreadExtent
Vec< TDim, TIdx > m_gridBlockExtent
Vec< TDim, TIdx > m_threadElemExtent
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto max(T const &max_ctx, Tx const &x, Ty const &y)
Returns the larger of two arguments. NaNs are treated as missing data (between a NaN and a numeric va...
Definition: Traits.hpp:1263
The accelerator traits.
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC WorkDivMembers(alpaka::Vec< TDim, TIdx > const &gridBlockExtent, alpaka::Vec< TDim, TIdx > const &blockThreadExtent, alpaka::Vec< TDim, TIdx > const &elemExtent) -> WorkDivMembers< TDim, TIdx >
Deduction guide for the constructor which can be called without explicit template type parameters.