alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelGpuUniformCudaHipRt.hpp
/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
 * Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/acc/Traits.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Decay.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/core/RemoveRestrict.hpp"
#include "alpaka/dev/DevUniformCudaHipRt.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/dim/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/platform/PlatformUniformCudaHipRt.hpp"
#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
#include "alpaka/queue/Traits.hpp"
#include "alpaka/vec/Vec.hpp"
#include "alpaka/workdiv/WorkDivMembers.hpp"

#include <stdexcept>
#include <tuple>
#include <type_traits>
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
#    include <iostream>
#endif

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

#    if !defined(ALPAKA_HOST_ONLY)

#        include "alpaka/core/BoostPredef.hpp"

#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif

namespace alpaka
{
    namespace detail
    {
#        if BOOST_COMP_CLANG
#            pragma clang diagnostic push
#            pragma clang diagnostic ignored "-Wunused-template"
#        endif
        //! The GPU CUDA/HIP kernel entry point.
        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
        // We have put the function into a shallow namespace and given it a short name, so the mangled name in the
        // profiler (e.g. ncu) is as short as possible.
        template<typename TKernelFnObj, typename TApi, typename TAcc, typename TDim, typename TIdx, typename... TArgs>
        __global__ void gpuKernel(
            Vec<TDim, TIdx> const threadElemExtent,
            TKernelFnObj const kernelFnObj,
            TArgs... args)
        {
#        if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
#            error "Device capability >= 2.0 is required!"
#        endif

            const TAcc acc(threadElemExtent);

// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
#        if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
#        endif
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
#        if BOOST_COMP_CLANG
#            pragma clang diagnostic pop
#        endif
    } // namespace detail
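
    // A minimal sketch (illustrative, not part of the original header) of a kernel function object that
    // satisfies the requirements enforced by gpuKernel above: it must be callable with the accelerator by
    // const reference followed by the kernel arguments, and it must return void. The kernel name and
    // parameters are hypothetical.
    //
    //     struct FillKernel
    //     {
    //         template<typename TAcc>
    //         ALPAKA_FN_ACC void operator()(TAcc const& acc, float* buf, std::size_t n, float value) const
    //         {
    //             // Linearized global thread index along the first dimension.
    //             auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
    //             if(i < n)
    //                 buf[i] = value;
    //         }
    //     };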

    namespace uniform_cuda_hip
    {
        namespace detail
        {
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
            {
                if constexpr(TDim::value > 0)
                {
                    for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
                    {
                        if(vec[TDim::value - 1u - i] != 1)
                        {
                            throw std::runtime_error(
                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
                                "work division extents of the dimensions higher than 3 have to be 1!");
                        }
                    }
                }
            }

            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
            {
                dim3 dim(1, 1, 1);
                if constexpr(TDim::value >= 1)
                    dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
                if constexpr(TDim::value >= 2)
                    dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
                if constexpr(TDim::value >= 3)
                    dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
                checkVecOnly3Dim(vec);
                return dim;
            }
        } // namespace detail
    } // namespace uniform_cuda_hip
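
    // For illustration (not part of the original header): convertVecToUniformCudaHipDim maps the last
    // (fastest-varying) component of an alpaka Vec to dim3::x. E.g. a Vec<DimInt<3>, std::size_t>{8, 4, 2}
    // becomes a dim3 with x = 2, y = 4, z = 8; for fewer than three dimensions the unused dim3 components
    // stay 1, and checkVecOnly3Dim rejects work divisions whose extents beyond the third dimension are not 1.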

    //! The GPU CUDA/HIP accelerator execution task.
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
    {
    public:
        template<typename TWorkDiv>
        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(
            TWorkDiv&& workDiv,
            TKernelFnObj const& kernelFnObj,
            TArgs&&... args)
            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
            , m_kernelFnObj(kernelFnObj)
            , m_args(std::forward<TArgs>(args)...)
        {
            static_assert(
                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
                "The work division and the execution task have to be of the same dimensionality!");
        }

        TKernelFnObj m_kernelFnObj;
        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
    };
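
    // Usage sketch (illustrative; the helpers are part of the public alpaka kernel traits, not of this file):
    // this task is usually created and enqueued through the public API rather than instantiated directly, e.g.
    //
    //     auto task = alpaka::createTaskKernel<TAcc>(workDiv, kernelFnObj, args...);
    //     alpaka::enqueue(queue, task);
    //
    // or in one step via alpaka::exec<TAcc>(queue, workDiv, kernelFnObj, args...). Both end up in the Enqueue
    // trait specializations below.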

    namespace trait
    {
        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TAcc;
        };

        //! The GPU CUDA/HIP execution task device type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The GPU CUDA/HIP execution task platform type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };

        //! The CUDA/HIP non-blocking kernel enqueue trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct Enqueue<
            QueueUniformCudaHipRtNonBlocking<TApi>,
            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // std::size_t printfFifoSize;
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
#        endif
                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);

                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x
                          << " blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x << std::endl;
#        endif

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                // This checks for a valid work division that is also compliant with the maxima of the accelerator.
                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
                {
                    throw std::runtime_error(
                        "The given work division is not valid or not supported by the device of type "
                        + getAccName<TAcc>() + "!");
                }
#        endif

                // Get the size of the block shared dynamic memory.
                auto const blockSharedMemDynSizeBytes = std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        return getBlockSharedMemDynSizeBytes<TAcc>(
                            task.m_kernelFnObj,
                            blockThreadExtent,
                            threadElemExtent,
                            args...);
                    },
                    task.m_args);

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the block shared memory size.
                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
                          << std::endl;
#        endif
                auto kernelName = alpaka::detail::
                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the function attributes.
                typename TApi::FuncAttributes_t funcAttrs;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
#        endif

                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
                // Enqueue the kernel execution.
                // \NOTE: No const reference (const&) is allowed as the parameter type because the kernel launch
                // language extension expects the arguments by value. This forces the type of a float argument
                // given to this function with std::forward to be float instead of e.g. "float const& __ptr64"
                // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer
                // to the value location.
                std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
                    {
                        kernelName<<<
                            gridDim,
                            blockDim,
                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
                    },
                    task.m_args);

                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                {
                    // Wait for the kernel execution to finish, but do not check the error return of this call.
                    // Do not use the alpaka::wait method because it checks the error itself, but we want to give
                    // a custom error message.
                    std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
                    auto const msg = std::string{
                        "execution of kernel: '" + std::string{core::demangled<TKernelFnObj>} + "' failed with"};
                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
                }
            }
        };
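
        // The blocking specialization below differs from the non-blocking one above only in the
        // synchronization at the end: it always waits for the stream after the launch, whereas the
        // non-blocking variant waits only in debug mode, purely to surface kernel errors early.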

        //! The CUDA/HIP synchronous kernel enqueue trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct Enqueue<
            QueueUniformCudaHipRtBlocking<TApi>,
            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtBlocking<TApi>& queue,
                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // std::size_t printfFifoSize;
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
#        endif
                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);

                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x
                          << std::endl;
                std::cout << __func__ << " blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x
                          << std::endl;
#        endif

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                // This checks for a valid work division that is also compliant with the maxima of the accelerator.
                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
                {
                    throw std::runtime_error(
                        "The given work division is not valid or not supported by the device of type "
                        + getAccName<TAcc>() + "!");
                }
#        endif

                // Get the size of the block shared dynamic memory.
                auto const blockSharedMemDynSizeBytes = std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        return getBlockSharedMemDynSizeBytes<TAcc>(
                            task.m_kernelFnObj,
                            blockThreadExtent,
                            threadElemExtent,
                            args...);
                    },
                    task.m_args);

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the block shared memory size.
                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
                          << std::endl;
#        endif

                auto kernelName = alpaka::detail::
                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the function attributes.
                typename TApi::FuncAttributes_t funcAttrs;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
#        endif

                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));

                // Enqueue the kernel execution.
                std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
                    {
                        kernelName<<<
                            gridDim,
                            blockDim,
                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
                    },
                    task.m_args);

                // Wait for the kernel execution to finish, but do not check the error return of this call.
                // Do not use the alpaka::wait method because it checks the error itself, but we want to give a
                // custom error message.
                std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                {
                    auto const msg
                        = std::string{"execution of kernel: '" + core::demangled<TKernelFnObj> + "' failed with"};
                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
                }
            }
        };
    } // namespace trait
} // namespace alpaka

#    endif

#endif
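
Example usage (illustrative sketch, not part of the header): putting the pieces together through the public
alpaka API. The kernel, extents, and variable names are placeholders chosen for this sketch, and it assumes a
CUDA build with ALPAKA_ACC_GPU_CUDA_ENABLED; the exact host-side calls may differ between alpaka versions.

    #include <alpaka/alpaka.hpp>

    #include <cstddef>

    struct SquareKernel
    {
        template<typename TAcc>
        ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, std::size_t n) const
        {
            auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
            if(i < n)
                data[i] *= data[i];
        }
    };

    int main()
    {
        using Dim = alpaka::DimInt<1>;
        using Idx = std::size_t;
        using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;

        // Select the first CUDA device and create a non-blocking queue on it.
        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0);
        auto queue = alpaka::Queue<Acc, alpaka::NonBlocking>{dev};

        constexpr Idx n = 4096;
        auto buf = alpaka::allocBuf<float, Idx>(dev, alpaka::Vec<Dim, Idx>{n});

        // 16 blocks x 256 threads, one element per thread.
        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
            alpaka::Vec<Dim, Idx>{16},
            alpaka::Vec<Dim, Idx>{256},
            alpaka::Vec<Dim, Idx>{1}};

        // Creates a TaskKernelGpuUniformCudaHipRt under the hood and enqueues it on the stream.
        alpaka::exec<Acc>(queue, workDiv, SquareKernel{}, alpaka::getPtrNative(buf), n);
        alpaka::wait(queue);
    }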