alpaka — Abstraction Library for Parallel Kernel Acceleration
Documentation for the file TaskKernelGpuUniformCudaHipRt.hpp.
1/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
2 * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
3 * SPDX-License-Identifier: MPL-2.0
4 */
5
6#pragma once
7
11#include "alpaka/core/Cuda.hpp"
12#include "alpaka/core/Decay.hpp"
14#include "alpaka/core/Hip.hpp"
17#include "alpaka/dev/Traits.hpp"
18#include "alpaka/dim/Traits.hpp"
19#include "alpaka/idx/Traits.hpp"
27
28#include <stdexcept>
29#include <tuple>
30#include <type_traits>
31
32#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
33# include <iostream>
34#endif
35
36#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
37
38# if !defined(ALPAKA_HOST_ONLY)
39
41
42# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
43# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
44# endif
45
46# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
47# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
48# endif
49
50namespace alpaka
51{
52 namespace detail
53 {
#    if BOOST_COMP_CLANG
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wunused-template"
#    endif
        //! The GPU CUDA/HIP kernel entry point.
        //
        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
        // We have put the function into a shallow namespace and gave it a short name, so the mangled name in the
        // profiler (e.g. ncu) is as short as possible.
        //
        // \param threadElemExtent the number of elements processed per thread; passed by value because kernel
        //        launch parameters must be passed by value.
        // \param kernelFnObj the user's kernel function object whose operator()(acc, args...) is invoked.
        // \param args the kernel invocation arguments (already decayed copies, see TaskKernelGpuUniformCudaHipRt).
        template<typename TKernelFnObj, typename TAcc, typename... TArgs>
        __global__ void gpuKernel(
            Vec<Dim<TAcc>, Idx<TAcc>> const threadElemExtent,
            TKernelFnObj const kernelFnObj,
            TArgs... args)
        {
            // Construct the accelerator context for this thread from the per-thread element extent.
            TAcc const acc(threadElemExtent);

// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
#    if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
#    endif
            // The const_cast documents that the kernel only ever sees a const accelerator reference.
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
#    if BOOST_COMP_CLANG
#        pragma clang diagnostic pop
#    endif
81
82 template<typename TKernelFnObj, typename TAcc, typename... TArgs>
83 inline void (*kernelName)(
85 TKernelFnObj const,
87 = gpuKernel<TKernelFnObj, TAcc, TArgs...>;
88
89 } // namespace detail
90
91 namespace uniform_cuda_hip
92 {
93 namespace detail
94 {
95 template<typename TDim, typename TIdx>
97 {
98 if constexpr(TDim::value > 0)
99 {
100 for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
101 {
102 if(vec[TDim::value - 1u - i] != 1)
103 {
104 throw std::runtime_error(
105 "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
106 "work division extents of the dimensions higher 3 have to be 1!");
107 }
108 }
109 }
110 }
111
112 template<typename TDim, typename TIdx>
114 {
115 dim3 dim(1, 1, 1);
116 if constexpr(TDim::value >= 1)
117 dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
118 if constexpr(TDim::value >= 2)
119 dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
120 if constexpr(TDim::value >= 3)
121 dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
122 checkVecOnly3Dim(vec);
123 return dim;
124 }
125 } // namespace detail
126 } // namespace uniform_cuda_hip
127
128 //! The GPU CUDA/HIP accelerator execution task.
129 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
130 class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
131 {
132 public:
133 template<typename TWorkDiv>
135 TWorkDiv&& workDiv,
136 TKernelFnObj const& kernelFnObj,
137 TArgs&&... args)
138 : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
139 , m_kernelFnObj(kernelFnObj)
140 , m_args(std::forward<TArgs>(args)...)
141 {
142 static_assert(
143 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
144 "The work division and the execution task have to be of the same dimensionality!");
145 }
146
147 TKernelFnObj m_kernelFnObj;
148 std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
149 };
150
151 namespace trait
152 {
153 //! The GPU CUDA/HIP execution task accelerator type trait specialization.
154 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
155 struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
156 {
158 };
159
160 //! The GPU CUDA/HIP execution task device type trait specialization.
161 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
162 struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
163 {
165 };
166
        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            // The task's dimensionality is the dimensionality of its work division.
            using type = TDim;
        };
173
174 //! The CPU CUDA/HIP execution task platform type trait specialization.
175 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
176 struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
177 {
179 };
180
        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            // The task indexes work items with the same integral type as its work division.
            using type = TIdx;
        };
187
188 //! The CUDA/HIP kernel enqueue trait specialization.
189 template<
190 typename TApi,
191 bool TBlocking,
192 typename TAcc,
193 typename TDim,
194 typename TIdx,
195 typename TKernelFnObj,
196 typename... TArgs>
197 struct Enqueue<
198 uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
199 TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
200 {
204 {
206 // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
207
208# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
209 // std::size_t printfFifoSize;
210 // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
211 // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
212 // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
213 // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
214 // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
215# endif
216 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
217 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
218 auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
219
220 dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
221 dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
223
224# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
225 std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
226 std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
227 << ")\n";
228# endif
229
230# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
231 // This checks for a valid work division that is also compliant with the hardware maxima of the
232 // accelerator.
233 if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
234 {
235 throw std::runtime_error(
236 "The given work division is not valid or not supported by the device of type "
238 }
239# endif
240
241 // Get the size of the block shared dynamic memory.
242 auto const blockSharedMemDynSizeBytes = std::apply(
243 [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
244 return getBlockSharedMemDynSizeBytes<TAcc>(
245 task.m_kernelFnObj,
246 blockThreadExtent,
247 threadElemExtent,
248 args...);
249 },
250 task.m_args);
251
252# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
253 // Log the block shared memory idx.
254 std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
255 << std::endl;
256# endif
257
258 auto kernelName
259 = alpaka::detail::kernelName<TKernelFnObj, TAcc, remove_restrict_t<std::decay_t<TArgs>>...>;
260
261# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
262 // Log the function attributes.
263 typename TApi::FuncAttributes_t funcAttrs;
264 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
265 std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
266 << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
267 << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
268 << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
269 << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
270 << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
271# endif
272
273 // Set the current device.
274 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
275
276 // Enqueue the kernel execution.
277 // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
278 // language extension expects the arguments by value. This forces the type of a float argument given
279 // with std::forward to this function to be of type float instead of e.g. "float const & __ptr64"
280 // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer to the
281 // value location.
282 std::apply(
283 [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
284 {
285 kernelName<<<
286 gridDim,
287 blockDim,
288 static_cast<std::size_t>(blockSharedMemDynSizeBytes),
289 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
290 },
291 task.m_args);
292
293 if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
294 {
295 // Wait for the kernel execution to finish but do not check error return of this call.
296 // Do not use the alpaka::wait method because it checks the error itself but we want to give a
297 // custom error message.
298 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
299 }
300 if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
301 {
302 auto const msg
303 = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
304 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
305 }
306 }
307 };
308
309 //! \brief Specialisation of the class template FunctionAttributes
310 //! \tparam TApi The type the API of the GPU accelerator backend. Currently Cuda or Hip.
311 //! \tparam TDim The dimensionality of the accelerator device properties.
312 //! \tparam TIdx The idx type of the accelerator device properties.
313 //! \tparam TKernelFn Kernel function object type.
314 //! \tparam TArgs Kernel function object argument types as a parameter pack.
315 template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
316 struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
317 {
318 //! \param dev The device instance
319 //! \param kernelFn The kernel function object which should be executed.
320 //! \param args The kernel invocation arguments.
321 //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
322 //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
324 [[maybe_unused]] TDev const& dev,
325 [[maybe_unused]] TKernelFn const& kernelFn,
326 [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
327 {
328 auto kernelName = alpaka::detail::kernelName<
329 TKernelFn,
332
333 typename TApi::FuncAttributes_t funcAttrs;
334# if BOOST_COMP_GNUC
335 // Disable and enable compile warnings for gcc
336# pragma GCC diagnostic push
337# pragma GCC diagnostic ignored "-Wconditionally-supported"
338# endif
340 TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
341# if BOOST_COMP_GNUC
342# pragma GCC diagnostic pop
343# endif
344
345 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
346 kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
347 kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
348 kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
349 kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
350 kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
351 kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
352 kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
353
354# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
355 printf("Kernel Function Attributes: \n");
356 printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
357 printf(
358 "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu maxDynamicSharedSizeBytes: %d "
359 "\n",
360 funcAttrs.constSizeBytes,
361 funcAttrs.localSizeBytes,
362 funcAttrs.sharedSizeBytes,
363 funcAttrs.maxDynamicSharedSizeBytes);
364
365 printf(
366 "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
367 funcAttrs.numRegs,
368 funcAttrs.ptxVersion,
369 funcAttrs.maxThreadsPerBlock);
370# endif
371 return kernelFunctionAttributes;
372 }
373 };
374
375 } // namespace trait
376
377} // namespace alpaka
378
// These macros can be used to give a more readable name to a GPU kernel.
// KERNEL must be the class or struct whose operator()(acc, ...) implements the kernel.
// ::NAME (or ::NAMESPACE::NAME) must be a unique name across the whole program.

// Global forward declaration used by the static_assert inside the macro: if the macro is expanded inside a
// namespace, the locally re-declared struct is a different entity than this global one and the assert fires.
struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace;

// Defines a __global__ wrapper named NAME around KERNEL's operator() and specialises
// alpaka::detail::kernelName so alpaka launches (and profilers display) ::NAME instead of the generic gpuKernel.
#    define ALPAKA_KERNEL_NAME(KERNEL, NAME)                                                                          \
                                                                                                                      \
        struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace;                                   \
                                                                                                                      \
        static_assert(                                                                                                \
            std::is_same_v<                                                                                           \
                The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace,                                  \
                ::The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace>,                               \
            "The ALPAKA_KERNEL_NAME macro must be called in the global namespace");                                   \
                                                                                                                      \
        template<typename TAcc, typename... TArgs>                                                                    \
        __global__ void NAME(                                                                                         \
            alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent,                                           \
            KERNEL const kernelFnObj,                                                                                 \
            TArgs... args)                                                                                            \
        {                                                                                                             \
            TAcc const acc(extent);                                                                                   \
            kernelFnObj(const_cast<TAcc const&>(acc), args...);                                                       \
        }                                                                                                             \
                                                                                                                      \
        namespace alpaka::detail                                                                                      \
        {                                                                                                             \
            template<typename TAcc, typename... TArgs>                                                                \
            inline void (*kernelName<KERNEL, TAcc, TArgs...>)(                                                        \
                alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const,                                              \
                KERNEL const,                                                                                         \
                TArgs...)                                                                                             \
                = ::NAME<TAcc, TArgs...>;                                                                             \
        }
414
// Global forward declaration used by the static_assert inside the macro: if the macro is expanded inside a
// namespace, the locally re-declared struct is a different entity than this global one and the assert fires.
struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace;

// Same as ALPAKA_KERNEL_NAME, but places the __global__ wrapper inside NAMESPACE so the kernel appears as
// ::NAMESPACE::NAME. The macro itself must still be invoked at global scope.
#    define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)                                                        \
        struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace;                            \
                                                                                                                      \
        static_assert(                                                                                                \
            std::is_same_v<                                                                                           \
                The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace,                           \
                ::The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace>,                        \
            "The ALPAKA_KERNEL_SCOPED_NAME macro must be called in the global namespace");                            \
                                                                                                                      \
        namespace NAMESPACE                                                                                           \
        {                                                                                                             \
            template<typename TAcc, typename... TArgs>                                                                \
            __global__ void NAME(                                                                                     \
                alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent,                                       \
                KERNEL const kernelFnObj,                                                                             \
                TArgs... args)                                                                                        \
            {                                                                                                         \
                TAcc const acc(extent);                                                                               \
                kernelFnObj(const_cast<TAcc const&>(acc), args...);                                                   \
            }                                                                                                         \
        }                                                                                                             \
                                                                                                                      \
        namespace alpaka::detail                                                                                      \
        {                                                                                                             \
            template<typename TAcc, typename... TArgs>                                                                \
            inline void (*kernelName<KERNEL, TAcc, TArgs...>)(                                                        \
                alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const,                                              \
                KERNEL const,                                                                                         \
                TArgs...)                                                                                             \
                = ::NAMESPACE::NAME<TAcc, TArgs...>;                                                                  \
        }
448
449
# else

// In host-only mode (ALPAKA_HOST_ONLY defined), no __global__ functions can be emitted,
// so the naming macros expand to nothing.

#    define ALPAKA_KERNEL_NAME(KERNEL, NAME)
#    define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)

# endif

#else

// If CUDA or HIP are not available, expand to empty macros

# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)

#endif
#define ALPAKA_DEBUG_MINIMAL
The minimal debug level.
Definition Debug.hpp:16
#define ALPAKA_DEBUG
Set the minimum log level if it is not defined.
Definition Debug.hpp:22
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
The GPU CUDA/HIP accelerator execution task.
std::tuple< remove_restrict_t< std::decay_t< TArgs > >... > m_args
ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
A n-dimensional vector.
Definition Vec.hpp:38
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
auto clipCast(V const &val) -> T
Definition ClipCast.hpp:16
__global__ void gpuKernel(Vec< Dim< TAcc >, Idx< TAcc > > const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
The GPU CUDA/HIP kernel entry point.
void(* kernelName)(Vec< Dim< TAcc >, Idx< TAcc > > const, TKernelFnObj const, remove_restrict_t< std::decay_t< TArgs > >...)
ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec< TDim, TIdx > const &vec) -> dim3
ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec< TDim, TIdx > const &vec) -> void
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition Traits.hpp:29
typename remove_restrict< T >::type remove_restrict_t
Helper to remove restrict from a type.
ALPAKA_FN_HOST auto getDev(T const &t)
Definition Traits.hpp:68
ALPAKA_FN_HOST auto getAccName() -> std::string
Definition Traits.hpp:100
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
STL namespace.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
The accelerator type trait.
Definition Traits.hpp:37
The device type trait.
Definition Traits.hpp:23
The dimension getter type trait.
Definition Traits.hpp:14
static ALPAKA_FN_HOST auto enqueue(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > &queue, TaskKernelGpuUniformCudaHipRt< TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs... > const &task) -> void
The queue enqueue trait.
Definition Traits.hpp:27
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, TKernelFn const &kernelFn, TArgs &&... args) -> alpaka::KernelFunctionAttributes
The structure template to access to the functions attributes of a kernel function object.
Definition Traits.hpp:79
The idx type trait.
Definition Traits.hpp:25
The platform type trait.
Definition Traits.hpp:30