alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelGpuUniformCudaHipRt.hpp
/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
 * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
#include "alpaka/acc/Traits.hpp"
#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Decay.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/core/RemoveRestrict.hpp"
#include "alpaka/dev/DevUniformCudaHipRt.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/dim/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/kernel/KernelFunctionAttributes.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/platform/Traits.hpp"
#include "alpaka/queue/Traits.hpp"
#include "alpaka/vec/Vec.hpp"
#include "alpaka/workdiv/WorkDivMembers.hpp"

#include <algorithm>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
#    include <iostream>
#endif

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

#    if !defined(ALPAKA_HOST_ONLY)

#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif

namespace alpaka
{
    namespace detail
    {
#if BOOST_COMP_CLANG
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wunused-template"
#endif
        //! The GPU CUDA/HIP kernel entry point.
        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
        // We have put the function into a shallow namespace and given it a short name, so that the mangled name
        // shown in the profiler (e.g. ncu) is as short as possible.
        template<typename TKernelFnObj, typename TApi, typename TAcc, typename TDim, typename TIdx, typename... TArgs>
        __global__ void gpuKernel(
            Vec<TDim, TIdx> const threadElemExtent,
            TKernelFnObj const kernelFnObj,
            TArgs... args)
        {
            TAcc const acc(threadElemExtent);

// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
#if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
#endif
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
#if BOOST_COMP_CLANG
#    pragma clang diagnostic pop
#endif
    } // namespace detail

    namespace uniform_cuda_hip
    {
        namespace detail
        {
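            // Added illustration (not part of the original source): checkVecOnly3Dim below
            // enforces that only the three fastest-moving dimensions may have an extent other
            // than 1, because CUDA/HIP can map at most three dimensions. For TDim = 5, an
            // extent of (1, 1, 4, 8, 16) passes while (2, 1, 4, 8, 16) throws.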
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
            {
                if constexpr(TDim::value > 0)
                {
                    for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
                    {
                        if(vec[TDim::value - 1u - i] != 1)
                        {
                            throw std::runtime_error(
                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
                                "work division extents of dimensions higher than 3 have to be 1!");
                        }
                    }
                }
            }

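            // Added illustration: alpaka stores extents from the slowest- to the fastest-moving
            // dimension, while dim3.x is the fastest dimension. So, for example,
            // Vec<DimInt<3>, std::size_t>{8, 16, 32} is converted to dim3(32, 16, 8).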
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
            {
                dim3 dim(1, 1, 1);
                if constexpr(TDim::value >= 1)
                    dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
                if constexpr(TDim::value >= 2)
                    dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
                if constexpr(TDim::value >= 3)
                    dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
                checkVecOnly3Dim(vec);
                return dim;
            }
        } // namespace detail
    } // namespace uniform_cuda_hip

    //! The GPU CUDA/HIP accelerator execution task.
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
    {
    public:
        template<typename TWorkDiv>
        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(
            TWorkDiv&& workDiv,
            TKernelFnObj const& kernelFnObj,
            TArgs&&... args)
            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
            , m_kernelFnObj(kernelFnObj)
            , m_args(std::forward<TArgs>(args)...)
        {
            static_assert(
                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
                "The work division and the execution task have to be of the same dimensionality!");
        }

        TKernelFnObj m_kernelFnObj;
        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
    };
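
    // Usage sketch (illustrative, not part of the original header): this task is normally not
    // constructed directly. alpaka::createTaskKernel<TAcc>(workDiv, kernelFnObj, args...) or
    // alpaka::exec<TAcc>(queue, workDiv, kernelFnObj, args...) builds it and hands it to the
    // Enqueue trait specialization below.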

    namespace trait
    {
        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TAcc;
        };

        //! The GPU CUDA/HIP execution task device type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The GPU CUDA/HIP execution task platform type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };

        //! The CUDA/HIP kernel enqueue trait specialization.
        template<
            typename TApi,
            bool TBlocking,
            typename TAcc,
            typename TDim,
            typename TIdx,
            typename TKernelFnObj,
            typename... TArgs>
        struct Enqueue<
            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // std::size_t printfFifoSize;
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
#endif
                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);

                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
                          << ")\n";
#endif

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                // This checks for a valid work division that is also compliant with the hardware maxima of the
                // accelerator.
                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
                {
                    throw std::runtime_error(
                        "The given work division is not valid or not supported by the device of type "
                        + getAccName<TAcc>() + "!");
                }
#endif

                // Get the size of the block shared dynamic memory.
                auto const blockSharedMemDynSizeBytes = std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        return getBlockSharedMemDynSizeBytes<TAcc>(
                            task.m_kernelFnObj,
                            blockThreadExtent,
                            threadElemExtent,
                            args...);
                    },
                    task.m_args);

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the block shared dynamic memory size.
                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
                          << std::endl;
#endif

                auto kernelName = alpaka::detail::
                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the function attributes.
                typename TApi::FuncAttributes_t funcAttrs;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
#endif

                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));

                // Enqueue the kernel execution.
                // \NOTE: No const reference (const&) is allowed as the parameter type because the kernel launch
                // language extension expects the arguments by value. This forces the type of a float argument
                // passed with std::forward to this function to be float instead of e.g. "float const& __ptr64"
                // (MSVC). If not passed by value, the kernel launch code copies the pointer to the value location
                // instead of the value itself.
                std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
                    {
                        kernelName<<<
                            gridDim,
                            blockDim,
                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
                    },
                    task.m_args);

                if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                {
                    // Wait for the kernel execution to finish, but do not check the error return of this call.
                    // We do not use the alpaka::wait method because it checks the error itself; here we want to
                    // emit a custom error message.
                    std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
                }
                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                {
                    auto const msg
                        = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
                }
            }
        };
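
        // Hedged example (added): with TBlocking = false, this specialization is reached via
        //     alpaka::exec<Acc>(queueNonBlocking, workDiv, kernelFnObj, args...);
        // the launch then returns immediately, and a later alpaka::wait(queue) synchronizes.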

        //! \brief Specialisation of the class template FunctionAttributes.
        //! \tparam TApi The API type of the GPU accelerator backend. Currently CUDA or HIP.
        //! \tparam TDim The dimensionality of the accelerator device properties.
        //! \tparam TIdx The idx type of the accelerator device properties.
        //! \tparam TKernelFn Kernel function object type.
        //! \tparam TArgs Kernel function object argument types as a parameter pack.
        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
        {
            //! \param dev The device instance.
            //! \param kernelFn The kernel function object which should be executed.
            //! \param args The kernel invocation arguments.
            //! \return A KernelFunctionAttributes instance. The default version always returns an instance with
            //! all fields zeroed. For CPU accelerators, the maximum number of threads per block reported for a
            //! kernel function is 1.
            ALPAKA_FN_HOST static auto getFunctionAttributes(
                [[maybe_unused]] TDev const& dev,
                [[maybe_unused]] TKernelFn const& kernelFn,
                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
            {
                auto kernelName = alpaka::detail::gpuKernel<
                    TKernelFn,
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    remove_restrict_t<std::decay_t<TArgs>>...>;

                typename TApi::FuncAttributes_t funcAttrs;
#if BOOST_COMP_GNUC
// Disable and re-enable the warning around the function pointer cast for gcc.
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wconditionally-supported"
#endif
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
#if BOOST_COMP_GNUC
#    pragma GCC diagnostic pop
#endif

                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printf("Kernel Function Attributes: \n");
                printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
                printf(
                    "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu maxDynamicSharedSizeBytes: %d "
                    "\n",
                    funcAttrs.constSizeBytes,
                    funcAttrs.localSizeBytes,
                    funcAttrs.sharedSizeBytes,
                    funcAttrs.maxDynamicSharedSizeBytes);

                printf(
                    "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d \n",
                    funcAttrs.numRegs,
                    funcAttrs.ptxVersion,
                    funcAttrs.maxThreadsPerBlock);
#endif
                return kernelFunctionAttributes;
            }
        };
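
        // Illustrative query (added; assumes alpaka's public getFunctionAttributes entry point):
        //     auto const attrs = alpaka::getFunctionAttributes<Acc>(dev, kernelFnObj, args...);
        //     int const maxThreads = attrs.maxThreadsPerBlock;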
    } // namespace trait
} // namespace alpaka

#    endif

#endif
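
A minimal end-to-end sketch of how this header is exercised (added for illustration, not part of the file), assuming the CUDA backend (ALPAKA_ACC_GPU_CUDA_ENABLED) and alpaka's public 1.x API; FillKernel, the buffer, and the extents are hypothetical:

#include <alpaka/alpaka.hpp>

#include <cstddef>

// Hypothetical kernel: writes `value` into every element of `out`.
struct FillKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* out, float value) const
    {
        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        out[i] = value;
    }
};

int main()
{
    using Dim = alpaka::DimInt<1>;
    using Idx = std::size_t;
    using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;

    auto const platform = alpaka::Platform<Acc>{};
    auto const dev = alpaka::getDevByIdx(platform, 0);
    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};

    Idx const n = 1024; // chosen so the grid divides evenly: 4 blocks x 256 threads
    auto buf = alpaka::allocBuf<float, Idx>(dev, alpaka::Vec<Dim, Idx>{n});

    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
        alpaka::Vec<Dim, Idx>{n / 256}, // grid block extent
        alpaka::Vec<Dim, Idx>{256}, // block thread extent
        alpaka::Vec<Dim, Idx>{1}}; // thread element extent

    // Creates a TaskKernelGpuUniformCudaHipRt and dispatches it through the Enqueue
    // specialization defined above; the blocking queue synchronizes on return.
    alpaka::exec<Acc>(queue, workDiv, FillKernel{}, alpaka::getPtrNative(buf), 42.0f);
    alpaka::wait(queue);
}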