alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
TaskKernelGpuUniformCudaHipRt.hpp
Go to the documentation of this file.
1/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
2 * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
3 * SPDX-License-Identifier: MPL-2.0
4 */
5
6#pragma once
7
11#include "alpaka/core/Cuda.hpp"
12#include "alpaka/core/Decay.hpp"
14#include "alpaka/core/Hip.hpp"
17#include "alpaka/dev/Traits.hpp"
18#include "alpaka/dim/Traits.hpp"
19#include "alpaka/idx/Traits.hpp"
27
28#include <stdexcept>
29#include <tuple>
30#include <type_traits>
31
32#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
33# include <iostream>
34#endif
35
36#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
37
38# if !defined(ALPAKA_HOST_ONLY)
39
41
42# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
43# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
44# endif
45
46# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
47# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
48# endif
49
50namespace alpaka
51{
52 namespace detail
53 {
#    if BOOST_COMP_CLANG
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wunused-template"
#    endif
        //! The GPU CUDA/HIP kernel entry point.
        //!
        //! Constructs the accelerator object on the device and invokes the user's kernel
        //! function object with it.
        //! \param threadElemExtent The number of elements processed per thread (passed by value; kernel launch
        //!        arguments must be values, not references).
        //! \param kernelFnObj The user kernel function object (must return void, see static_assert below).
        //! \param args The user kernel invocation arguments.
        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
        // We have put the function into a shallow namespace and gave it a short name, so the mangled name in the
        // profiler (e.g. ncu) is as short as possible.
        template<typename TKernelFnObj, typename TApi, typename TAcc, typename TDim, typename TIdx, typename... TArgs>
        __global__ void gpuKernel(
            Vec<TDim, TIdx> const threadElemExtent,
            TKernelFnObj const kernelFnObj,
            TArgs... args)
        {
            // The accelerator object lives for the duration of the kernel and is handed to the user functor.
            TAcc const acc(threadElemExtent);

// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
#    if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
#    endif
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
#    if BOOST_COMP_CLANG
#        pragma clang diagnostic pop
#    endif
81 } // namespace detail
82
83 namespace uniform_cuda_hip
84 {
85 namespace detail
86 {
87 template<typename TDim, typename TIdx>
89 {
90 if constexpr(TDim::value > 0)
91 {
92 for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
93 {
94 if(vec[TDim::value - 1u - i] != 1)
95 {
96 throw std::runtime_error(
97 "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
98 "work division extents of the dimensions higher 3 have to be 1!");
99 }
100 }
101 }
102 }
103
104 template<typename TDim, typename TIdx>
106 {
107 dim3 dim(1, 1, 1);
108 if constexpr(TDim::value >= 1)
109 dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
110 if constexpr(TDim::value >= 2)
111 dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
112 if constexpr(TDim::value >= 3)
113 dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
114 checkVecOnly3Dim(vec);
115 return dim;
116 }
117 } // namespace detail
118 } // namespace uniform_cuda_hip
119
120 //! The GPU CUDA/HIP accelerator execution task.
121 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
122 class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
123 {
124 public:
125 template<typename TWorkDiv>
127 TWorkDiv&& workDiv,
128 TKernelFnObj const& kernelFnObj,
129 TArgs&&... args)
130 : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
131 , m_kernelFnObj(kernelFnObj)
132 , m_args(std::forward<TArgs>(args)...)
133 {
134 static_assert(
135 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
136 "The work division and the execution task have to be of the same dimensionality!");
137 }
138
139 TKernelFnObj m_kernelFnObj;
140 std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
141 };
142
143 namespace trait
144 {
145 //! The GPU CUDA/HIP execution task accelerator type trait specialization.
146 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
147 struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
148 {
150 };
151
152 //! The GPU CUDA/HIP execution task device type trait specialization.
153 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
154 struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
155 {
157 };
158
        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            // The task's dimensionality is the dimensionality of its work division.
            using type = TDim;
        };
165
166 //! The CPU CUDA/HIP execution task platform type trait specialization.
167 template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
168 struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
169 {
171 };
172
        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            // The task's index type is the index type of its work division.
            using type = TIdx;
        };
179
180 //! The CUDA/HIP kernel enqueue trait specialization.
181 template<
182 typename TApi,
183 bool TBlocking,
184 typename TAcc,
185 typename TDim,
186 typename TIdx,
187 typename TKernelFnObj,
188 typename... TArgs>
189 struct Enqueue<
190 uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
191 TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
192 {
196 {
198 // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
199
200# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
201 // std::size_t printfFifoSize;
202 // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
203 // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
204 // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
205 // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
206 // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
207# endif
208 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
209 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
210 auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
211
212 dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
213 dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
215
216# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
217 std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
218 std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
219 << ")\n";
220# endif
221
222# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
223 // This checks for a valid work division that is also compliant with the hardware maxima of the
224 // accelerator.
225 if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
226 {
227 throw std::runtime_error(
228 "The given work division is not valid or not supported by the device of type "
230 }
231# endif
232
233 // Get the size of the block shared dynamic memory.
234 auto const blockSharedMemDynSizeBytes = std::apply(
235 [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
236 return getBlockSharedMemDynSizeBytes<TAcc>(
237 task.m_kernelFnObj,
238 blockThreadExtent,
239 threadElemExtent,
240 args...);
241 },
242 task.m_args);
243
244# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
245 // Log the block shared memory idx.
246 std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
247 << std::endl;
248# endif
249
250 auto kernelName = alpaka::detail::
251 gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
252
253# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
254 // Log the function attributes.
255 typename TApi::FuncAttributes_t funcAttrs;
256 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
257 std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
258 << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
259 << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
260 << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
261 << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
262 << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
263# endif
264
265 // Set the current device.
266 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
267
268 // Enqueue the kernel execution.
269 // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
270 // language extension expects the arguments by value. This forces the type of a float argument given
271 // with std::forward to this function to be of type float instead of e.g. "float const & __ptr64"
272 // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer to the
273 // value location.
274 std::apply(
275 [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
276 {
277 kernelName<<<
278 gridDim,
279 blockDim,
280 static_cast<std::size_t>(blockSharedMemDynSizeBytes),
281 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
282 },
283 task.m_args);
284
285 if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
286 {
287 // Wait for the kernel execution to finish but do not check error return of this call.
288 // Do not use the alpaka::wait method because it checks the error itself but we want to give a
289 // custom error message.
290 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
291 }
292 if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
293 {
294 auto const msg
295 = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
296 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
297 }
298 }
299 };
300
301 //! \brief Specialisation of the class template FunctionAttributes
302 //! \tparam TApi The type the API of the GPU accelerator backend. Currently Cuda or Hip.
303 //! \tparam TDim The dimensionality of the accelerator device properties.
304 //! \tparam TIdx The idx type of the accelerator device properties.
305 //! \tparam TKernelFn Kernel function object type.
306 //! \tparam TArgs Kernel function object argument types as a parameter pack.
307 template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
308 struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
309 {
310 //! \param dev The device instance
311 //! \param kernelFn The kernel function object which should be executed.
312 //! \param args The kernel invocation arguments.
313 //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
314 //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
316 [[maybe_unused]] TDev const& dev,
317 [[maybe_unused]] TKernelFn const& kernelFn,
318 [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
319 {
320 auto kernelName = alpaka::detail::gpuKernel<
321 TKernelFn,
322 TApi,
324 TDim,
325 TIdx,
327
328 typename TApi::FuncAttributes_t funcAttrs;
329# if BOOST_COMP_GNUC
330 // Disable and enable compile warnings for gcc
331# pragma GCC diagnostic push
332# pragma GCC diagnostic ignored "-Wconditionally-supported"
333# endif
335 TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
336# if BOOST_COMP_GNUC
337# pragma GCC diagnostic pop
338# endif
339
340 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
341 kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
342 kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
343 kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
344 kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
345 kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
346 kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
347 kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
348
349# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
350 printf("Kernel Function Attributes: \n");
351 printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
352 printf(
353 "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu maxDynamicSharedSizeBytes: %d "
354 "\n",
355 funcAttrs.constSizeBytes,
356 funcAttrs.localSizeBytes,
357 funcAttrs.sharedSizeBytes,
358 funcAttrs.maxDynamicSharedSizeBytes);
359
360 printf(
361 "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
362 funcAttrs.numRegs,
363 funcAttrs.ptxVersion,
364 funcAttrs.maxThreadsPerBlock);
365# endif
366 return kernelFunctionAttributes;
367 }
368 };
369 } // namespace trait
370} // namespace alpaka
371
372# endif
373
374#endif
#define ALPAKA_DEBUG_MINIMAL
The minimal debug level.
Definition Debug.hpp:16
#define ALPAKA_DEBUG
Set the minimum log level if it is not defined.
Definition Debug.hpp:22
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
The GPU CUDA/HIP accelerator execution task.
std::tuple< remove_restrict_t< std::decay_t< TArgs > >... > m_args
ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
A n-dimensional vector.
Definition Vec.hpp:38
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
auto clipCast(V const &val) -> T
Definition ClipCast.hpp:16
__global__ void gpuKernel(Vec< TDim, TIdx > const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
The GPU CUDA/HIP kernel entry point.
ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec< TDim, TIdx > const &vec) -> dim3
ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec< TDim, TIdx > const &vec) -> void
The alpaka accelerator library.
typename remove_restrict< T >::type remove_restrict_t
Helper to remove restrict from a type.
ALPAKA_FN_HOST auto getDev(T const &t)
Definition Traits.hpp:68
ALPAKA_FN_HOST auto getAccName() -> std::string
Definition Traits.hpp:100
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
STL namespace.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
The accelerator type trait.
Definition Traits.hpp:37
The device type trait.
Definition Traits.hpp:23
The dimension getter type trait.
Definition Traits.hpp:14
static ALPAKA_FN_HOST auto enqueue(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > &queue, TaskKernelGpuUniformCudaHipRt< TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs... > const &task) -> void
The queue enqueue trait.
Definition Traits.hpp:27
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, TKernelFn const &kernelFn, TArgs &&... args) -> alpaka::KernelFunctionAttributes
The structure template to access to the functions attributes of a kernel function object.
Definition Traits.hpp:79
The idx type trait.
Definition Traits.hpp:25
The platform type trait.
Definition Traits.hpp:30