33#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
37#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
39# if !defined(ALPAKA_HOST_ONLY)
43# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
44# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
47# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
48# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
56# pragma clang diagnostic push
57# pragma clang diagnostic ignored "-Wunused-template"
63 template<
typename TKernelFnObj,
typename TAcc,
typename... TArgs>
66 TKernelFnObj
const kernelFnObj,
69 TAcc
const acc(threadElemExtent);
72# if !(ALPAKA_COMP_CLANG_CUDA && ALPAKA_COMP_CLANG)
74 std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)),
void>,
75 "The TKernelFnObj is required to return void!");
77 kernelFnObj(
const_cast<TAcc const&
>(acc), args...);
80# pragma clang diagnostic pop
83 template<
typename TKernelFnObj,
typename TAcc,
typename... TArgs>
88 =
gpuKernel<TKernelFnObj, TAcc, TArgs...>;
92 namespace uniform_cuda_hip
96 template<
typename TDim,
typename TIdx>
99 if constexpr(TDim::value > 0)
101 for(
auto i = std::min(
typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
103 if(vec[TDim::value - 1u - i] != 1)
105 throw std::runtime_error(
106 "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
107 "work division extents of the dimensions higher 3 have to be 1!");
113 template<
typename TDim,
typename TIdx>
117 if constexpr(TDim::value >= 1)
118 dim.x =
static_cast<unsigned>(vec[TDim::value - 1u]);
119 if constexpr(TDim::value >= 2)
120 dim.y =
static_cast<unsigned>(vec[TDim::value - 2u]);
121 if constexpr(TDim::value >= 3)
122 dim.z =
static_cast<unsigned>(vec[TDim::value - 3u]);
130 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
134 template<
typename TWorkDiv>
137 TKernelFnObj
const& kernelFnObj,
145 "The work division and the execution task have to be of the same dimensionality!");
149 std::tuple<remove_restrict_t<std::decay_t<TArgs>>...>
m_args;
155 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
162 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
169 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
176 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
183 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
196 typename TKernelFnObj,
199 uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
209# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
217 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
218 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
219 auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
225# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
226 std::cout << __func__ <<
" gridDim: (" << gridDim.z <<
", " << gridDim.y <<
", " << gridDim.x <<
")\n";
227 std::cout << __func__ <<
" blockDim: (" << blockDim.z <<
", " << blockDim.y <<
", " << blockDim.x
231# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
234 if(!isValidWorkDiv<TAcc>(task,
getDev(queue)))
236 throw std::runtime_error(
237 "The given work division is not valid or not supported by the device of type "
243 auto const blockSharedMemDynSizeBytes = std::apply(
245 return getBlockSharedMemDynSizeBytes<TAcc>(
253# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
255 std::cout << __func__ <<
" BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes <<
" B"
260 = alpaka::detail::kernelName<TKernelFnObj, TAcc, remove_restrict_t<std::decay_t<TArgs>>...>;
262# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
264 typename TApi::FuncAttributes_t funcAttrs;
266 std::cout << __func__ <<
" binaryVersion: " << funcAttrs.binaryVersion
267 <<
" constSizeBytes: " << funcAttrs.constSizeBytes <<
" B"
268 <<
" localSizeBytes: " << funcAttrs.localSizeBytes <<
" B"
269 <<
" maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
270 <<
" numRegs: " << funcAttrs.numRegs <<
" ptxVersion: " << funcAttrs.ptxVersion
271 <<
" sharedSizeBytes: " << funcAttrs.sharedSizeBytes <<
" B" << std::endl;
289 static_cast<std::size_t
>(blockSharedMemDynSizeBytes),
290 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
299 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
303 using namespace std::literals;
304 std::string
const msg
306 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
317 template<
typename TApi,
typename TDev,
typename TDim,
typename TIdx,
typename TKernelFn,
typename... TArgs>
326 [[maybe_unused]] TDev
const& dev,
327 [[maybe_unused]] TKernelFn
const& kernelFn,
335 typename TApi::FuncAttributes_t funcAttrs;
338# pragma GCC diagnostic push
339# pragma GCC diagnostic ignored "-Wconditionally-supported"
342 TApi::funcGetAttributes(&funcAttrs,
reinterpret_cast<void const*
>(kernelName)));
344# pragma GCC diagnostic pop
348 kernelFunctionAttributes.
constSizeBytes = funcAttrs.constSizeBytes;
349 kernelFunctionAttributes.
localSizeBytes = funcAttrs.localSizeBytes;
352 kernelFunctionAttributes.
numRegs = funcAttrs.numRegs;
353 kernelFunctionAttributes.
asmVersion = funcAttrs.ptxVersion;
354 kernelFunctionAttributes.
maxThreadsPerBlock =
static_cast<int>(funcAttrs.maxThreadsPerBlock);
356# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
357 printf(
"Kernel Function Attributes: \n");
358 printf(
"binaryVersion: %d \n", funcAttrs.binaryVersion);
360 "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu maxDynamicSharedSizeBytes: %d "
362 funcAttrs.constSizeBytes,
363 funcAttrs.localSizeBytes,
364 funcAttrs.sharedSizeBytes,
365 funcAttrs.maxDynamicSharedSizeBytes);
368 "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
370 funcAttrs.ptxVersion,
371 funcAttrs.maxThreadsPerBlock);
373 return kernelFunctionAttributes;
385struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace;
387# define ALPAKA_KERNEL_NAME(KERNEL, NAME) \
389 struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace; \
393 The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace, \
394 ::The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace>, \
395 "The ALPAKA_KERNEL_NAME macro must be called in the global namespace"); \
397 template<typename TAcc, typename... TArgs> \
398 __global__ void NAME( \
399 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent, \
400 KERNEL const kernelFnObj, \
403 TAcc const acc(extent); \
404 kernelFnObj(const_cast<TAcc const&>(acc), args...); \
407 namespace alpaka::detail \
409 template<typename TAcc, typename... TArgs> \
410 inline void (*kernelName<KERNEL, TAcc, TArgs...>)( \
411 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const, \
414 = ::NAME<TAcc, TArgs...>; \
417struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace;
419# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME) \
420 struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace; \
424 The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace, \
425 ::The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace>, \
426 "The ALPAKA_KERNEL_SCOPED_NAME macro must be called in the global namespace"); \
428 namespace NAMESPACE \
430 template<typename TAcc, typename... TArgs> \
431 __global__ void NAME( \
432 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent, \
433 KERNEL const kernelFnObj, \
436 TAcc const acc(extent); \
437 kernelFnObj(const_cast<TAcc const&>(acc), args...); \
441 namespace alpaka::detail \
443 template<typename TAcc, typename... TArgs> \
444 inline void (*kernelName<KERNEL, TAcc, TArgs...>)( \
445 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const, \
448 = ::NAMESPACE::NAME<TAcc, TArgs...>; \
456# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
457# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)
465# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
466# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)
#define ALPAKA_DEBUG_MINIMAL
The minimal debug level.
#define ALPAKA_DEBUG
Set the minimum log level if it is not defined.
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
A basic class holding the work division as grid block extent, block thread and thread element extent.
auto clipCast(V const &val) -> T
__global__ void gpuKernel(Vec< Dim< TAcc >, Idx< TAcc > > const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
The GPU CUDA/HIP kernel entry point.
void(* kernelName)(Vec< Dim< TAcc >, Idx< TAcc > > const, TKernelFnObj const, remove_restrict_t< std::decay_t< TArgs > >...)
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
typename remove_restrict< T >::type remove_restrict_t
Helper to remove restrict from a type.
ALPAKA_FN_HOST auto getDev(T const &t)
ALPAKA_FN_HOST auto getAccName() -> std::string
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
std::size_t constSizeBytes
std::size_t localSizeBytes
std::size_t sharedSizeBytes
int maxDynamicSharedSizeBytes
The accelerator type trait.
The dimension getter type trait.
The structure template to access to the functions attributes of a kernel function object.