TaskKernelGpuUniformCudaHipRt.hpp
/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
 * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/acc/Traits.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Decay.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/core/RemoveRestrict.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/dim/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/kernel/KernelFunctionAttributes.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/platform/Traits.hpp"
#include "alpaka/queue/Traits.hpp"
#include "alpaka/vec/Vec.hpp"
#include "alpaka/workdiv/WorkDivHelpers.hpp"
#include "alpaka/workdiv/WorkDivMembers.hpp"

#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
# include <iostream>
#endif

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

# if !defined(ALPAKA_HOST_ONLY)

# include "alpaka/core/Config.hpp"

# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
# endif

# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
# endif

namespace alpaka
{
    namespace detail
    {
# if ALPAKA_COMP_CLANG
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wunused-template"
# endif
        //! The GPU CUDA/HIP kernel entry point.
        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
        // We have put the function into a shallow namespace and given it a short name, so the mangled name in the
        // profiler (e.g. ncu) is as short as possible.
        template<typename TKernelFnObj, typename TAcc, typename... TArgs>
        __global__ void gpuKernel(
            Vec<Dim<TAcc>, Idx<TAcc>> const threadElemExtent,
            TKernelFnObj const kernelFnObj,
            TArgs... args)
        {
            TAcc const acc(threadElemExtent);

// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
# if !(ALPAKA_COMP_CLANG_CUDA && ALPAKA_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
# endif
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
# if ALPAKA_COMP_CLANG
# pragma clang diagnostic pop
# endif
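
        // Illustrative sketch (not part of the original header): a minimal kernel function object that
        // gpuKernel above can launch. The name SketchWriteOneKernel is hypothetical. The operator() takes
        // the accelerator by const reference, takes all further arguments by value, and must return void,
        // as enforced by the static_assert in gpuKernel.
        struct SketchWriteOneKernel
        {
            template<typename TAcc>
            ALPAKA_FN_ACC auto operator()(TAcc const& acc, float* buf) const -> void
            {
                // Write 1.0f at this thread's first-dimension global index (the only one for a 1D grid).
                auto const globalThreadIdx = getIdx<Grid, Threads>(acc)[0u];
                buf[globalThreadIdx] = 1.0f;
            }
        };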

        template<typename TKernelFnObj, typename TAcc, typename... TArgs>
        inline void (*kernelName)(
            Vec<Dim<TAcc>, Idx<TAcc>> const,
            TKernelFnObj const,
            remove_restrict_t<std::decay_t<TArgs>>...)
            = gpuKernel<TKernelFnObj, TAcc, TArgs...>;

    } // namespace detail

    namespace uniform_cuda_hip
    {
        namespace detail
        {
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
            {
                if constexpr(TDim::value > 0)
                {
                    for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
                    {
                        if(vec[TDim::value - 1u - i] != 1)
                        {
                            throw std::runtime_error(
                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
                                "work-division extents of the dimensions higher than 3 have to be 1!");
                        }
                    }
                }
            }

            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
            {
                dim3 dim(1, 1, 1);
                if constexpr(TDim::value >= 1)
                    dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
                if constexpr(TDim::value >= 2)
                    dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
                if constexpr(TDim::value >= 3)
                    dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
                checkVecOnly3Dim(vec);
                return dim;
            }
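
            // Worked example (illustrative, not part of the original header): for TDim = 2 and
            // vec = (8, 128), the last (fastest) extent maps to x and the first to y, so the result
            // is dim3{128, 8, 1}. Extents beyond the third dimension are not mapped; they must all
            // be 1, which checkVecOnly3Dim enforces.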
        } // namespace detail
    } // namespace uniform_cuda_hip

    //! The GPU CUDA/HIP accelerator execution task.
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
    {
    public:
        template<typename TWorkDiv>
        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(
            TWorkDiv&& workDiv,
            TKernelFnObj const& kernelFnObj,
            TArgs&&... args)
            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
            , m_kernelFnObj(kernelFnObj)
            , m_args(std::forward<TArgs>(args)...)
        {
            static_assert(
                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
                "The work division and the execution task have to be of the same dimensionality!");
        }

        TKernelFnObj m_kernelFnObj;
        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
    };
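
    // Usage sketch (illustrative, with hypothetical variable names): this task is normally not
    // instantiated directly but created and enqueued through the public alpaka API, e.g.
    //
    //     auto task = alpaka::createTaskKernel<Acc>(workDiv, kernelFnObj, args...);
    //     alpaka::enqueue(queue, task);
    //
    // or in one step via alpaka::exec<Acc>(queue, workDiv, kernelFnObj, args...), where Acc is
    // AccGpuUniformCudaHipRt<TApi, TDim, TIdx> and queue is a CUDA/HIP runtime queue.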

    namespace trait
    {
        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TAcc;
        };

        //! The GPU CUDA/HIP execution task device type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The GPU CUDA/HIP execution task platform type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };

        //! The CUDA/HIP kernel enqueue trait specialization.
        template<
            typename TApi,
            bool TBlocking,
            typename TAcc,
            typename TDim,
            typename TIdx,
            typename TKernelFnObj,
            typename... TArgs>
        struct Enqueue<
            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            static ALPAKA_FN_HOST auto enqueue(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx

# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // std::size_t printfFifoSize;
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
# endif
                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);

                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);

# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
                          << ")\n";
# endif

# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                // This checks for a valid work division that is also compliant with the hardware maxima of the
                // accelerator.
                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
                {
                    throw std::runtime_error(
                        "The given work division is not valid or not supported by the device of type "
                        + getAccName<TAcc>() + "!");
                }
# endif
241
242 // Get the size of the block shared dynamic memory.
243 auto const blockSharedMemDynSizeBytes = std::apply(
244 [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
245 return getBlockSharedMemDynSizeBytes<TAcc>(
246 task.m_kernelFnObj,
247 blockThreadExtent,
248 threadElemExtent,
249 args...);
250 },
251 task.m_args);
252
253# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
254 // Log the block shared memory idx.
255 std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
256 << std::endl;
257# endif
258
259 auto kernelName
260 = alpaka::detail::kernelName<TKernelFnObj, TAcc, remove_restrict_t<std::decay_t<TArgs>>...>;
261
# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the function attributes.
                typename TApi::FuncAttributes_t funcAttrs;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
# endif

273
274 // Set the current device.
275 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
276
277 // Enqueue the kernel execution.
278 // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
279 // language extension expects the arguments by value. This forces the type of a float argument given
280 // with std::forward to this function to be of type float instead of e.g. "float const & __ptr64"
281 // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer to the
282 // value location.
283 std::apply(
284 [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
285 {
286 kernelName<<<
287 gridDim,
288 blockDim,
289 static_cast<std::size_t>(blockSharedMemDynSizeBytes),
290 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
291 },
292 task.m_args);
293
294 if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
295 {
296 // Wait for the kernel execution to finish but do not check error return of this call.
297 // Do not use the alpaka::wait method because it checks the error itself but we want to give a
298 // custom error message.
299 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
300 }
301 if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
302 {
303 using namespace std::literals;
304 std::string const msg
305 = "execution of kernel '"s + std::string(core::demangled<TKernelFnObj>) + "' failed with"s;
306 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
307 }
308 }
309 };
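
        // Illustrative sketch (hypothetical kernel type MyKernel): the amount of dynamic shared memory
        // queried by enqueue above can be customised per kernel by specialising the
        // BlockSharedMemDynSizeBytes trait, e.g.
        //
        //     template<typename TAcc>
        //     struct BlockSharedMemDynSizeBytes<MyKernel, TAcc>
        //     {
        //         template<typename TVec, typename... TArgs>
        //         ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
        //             MyKernel const& /*kernel*/,
        //             TVec const& blockThreadExtent,
        //             TVec const& /*threadElemExtent*/,
        //             TArgs&&... /*args*/) -> std::size_t
        //         {
        //             // One float per thread in the block.
        //             return static_cast<std::size_t>(blockThreadExtent.prod()) * sizeof(float);
        //         }
        //     };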

        //! \brief Specialisation of the class template FunctionAttributes
        //! \tparam TApi The API type of the GPU accelerator backend. Currently CUDA or HIP.
        //! \tparam TDev The device type.
        //! \tparam TDim The dimensionality of the accelerator device properties.
        //! \tparam TIdx The idx type of the accelerator device properties.
        //! \tparam TKernelFn Kernel function object type.
        //! \tparam TArgs Kernel function object argument types as a parameter pack.
        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
        {
            //! \param dev The device instance
            //! \param kernelFn The kernel function object which should be executed.
            //! \param args The kernel invocation arguments.
            //! \return KernelFunctionAttributes instance. The default version always returns an instance with
            //! zero-initialized fields. For CPU accelerators, the maximum number of threads per block reported
            //! for a kernel function is 1.
            static ALPAKA_FN_HOST auto getFunctionAttributes(
                [[maybe_unused]] TDev const& dev,
                [[maybe_unused]] TKernelFn const& kernelFn,
                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
            {
                auto kernelName = alpaka::detail::kernelName<
                    TKernelFn,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    remove_restrict_t<std::decay_t<TArgs>>...>;

                typename TApi::FuncAttributes_t funcAttrs;
# if ALPAKA_COMP_GNUC
                // Disable and enable compile warnings for gcc
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wconditionally-supported"
# endif
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
# if ALPAKA_COMP_GNUC
# pragma GCC diagnostic pop
# endif

                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);

# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printf("Kernel Function Attributes: \n");
                printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
                printf(
                    "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu maxDynamicSharedSizeBytes: %d "
                    "\n",
                    funcAttrs.constSizeBytes,
                    funcAttrs.localSizeBytes,
                    funcAttrs.sharedSizeBytes,
                    funcAttrs.maxDynamicSharedSizeBytes);

                printf(
                    "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
                    funcAttrs.numRegs,
                    funcAttrs.ptxVersion,
                    funcAttrs.maxThreadsPerBlock);
# endif
                return kernelFunctionAttributes;
            }
        };
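
        // Usage sketch (illustrative): the attributes can be queried through the public API, e.g.
        // to validate a block size against the kernel's hardware limit before launching:
        //
        //     auto const attrs = alpaka::getFunctionAttributes<Acc>(dev, kernelFnObj, args...);
        //     if(blockThreadCount > attrs.maxThreadsPerBlock) { /* choose a smaller block */ }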

    } // namespace trait

} // namespace alpaka

// These macros can be used to give a more readable name to a GPU kernel.
// KERNEL must be the class or struct whose operator()(acc, ...) implements the kernel.
// ::NAME (or ::NAMESPACE::NAME) must be a unique name across the whole program.

struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace;

# define ALPAKA_KERNEL_NAME(KERNEL, NAME) \
 \
    struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace; \
 \
    static_assert( \
        std::is_same_v< \
            The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace, \
            ::The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace>, \
        "The ALPAKA_KERNEL_NAME macro must be called in the global namespace"); \
 \
    template<typename TAcc, typename... TArgs> \
    __global__ void NAME( \
        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent, \
        KERNEL const kernelFnObj, \
        TArgs... args) \
    { \
        TAcc const acc(extent); \
        kernelFnObj(const_cast<TAcc const&>(acc), args...); \
    } \
 \
    namespace alpaka::detail \
    { \
        template<typename TAcc, typename... TArgs> \
        inline void (*kernelName<KERNEL, TAcc, TArgs...>)( \
            alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const, \
            KERNEL const, \
            TArgs...) \
            = ::NAME<TAcc, TArgs...>; \
    }

struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace;

# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME) \
    struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace; \
 \
    static_assert( \
        std::is_same_v< \
            The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace, \
            ::The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace>, \
        "The ALPAKA_KERNEL_SCOPED_NAME macro must be called in the global namespace"); \
 \
    namespace NAMESPACE \
    { \
        template<typename TAcc, typename... TArgs> \
        __global__ void NAME( \
            alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent, \
            KERNEL const kernelFnObj, \
            TArgs... args) \
        { \
            TAcc const acc(extent); \
            kernelFnObj(const_cast<TAcc const&>(acc), args...); \
        } \
    } \
 \
    namespace alpaka::detail \
    { \
        template<typename TAcc, typename... TArgs> \
        inline void (*kernelName<KERNEL, TAcc, TArgs...>)( \
            alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const, \
            KERNEL const, \
            TArgs...) \
            = ::NAMESPACE::NAME<TAcc, TArgs...>; \
    }
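
// Usage sketch (illustrative, with hypothetical names): both macros must be invoked at global
// scope, e.g.
//
//     ALPAKA_KERNEL_NAME(MyKernel, myKernelName)
//     ALPAKA_KERNEL_SCOPED_NAME(OtherKernel, mykernels, otherKernelName)
//
// Afterwards, launches of MyKernel show up in profilers such as ncu under the readable name
// myKernelName instead of the generic alpaka::detail::gpuKernel instantiation.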


# else

// In host-only mode, expand to empty macros

# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)

# endif

#else

// If CUDA or HIP are not available, expand to empty macros

# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)

#endif