32#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
36#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
38# if !defined(ALPAKA_HOST_ONLY)
42# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
43# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
46# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
47# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
55# pragma clang diagnostic push
56# pragma clang diagnostic ignored "-Wunused-template"
62 template<
typename TKernelFnObj,
typename TAcc,
typename... TArgs>
65 TKernelFnObj
const kernelFnObj,
68 TAcc
const acc(threadElemExtent);
71# if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
73 std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)),
void>,
74 "The TKernelFnObj is required to return void!");
76 kernelFnObj(
const_cast<TAcc const&
>(acc), args...);
79# pragma clang diagnostic pop
82 template<
typename TKernelFnObj,
typename TAcc,
typename... TArgs>
87 =
gpuKernel<TKernelFnObj, TAcc, TArgs...>;
91 namespace uniform_cuda_hip
95 template<
typename TDim,
typename TIdx>
98 if constexpr(TDim::value > 0)
100 for(
auto i = std::min(
typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
102 if(vec[TDim::value - 1u - i] != 1)
104 throw std::runtime_error(
105 "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
106 "work division extents of the dimensions higher 3 have to be 1!");
112 template<
typename TDim,
typename TIdx>
116 if constexpr(TDim::value >= 1)
117 dim.x =
static_cast<unsigned>(vec[TDim::value - 1u]);
118 if constexpr(TDim::value >= 2)
119 dim.y =
static_cast<unsigned>(vec[TDim::value - 2u]);
120 if constexpr(TDim::value >= 3)
121 dim.z =
static_cast<unsigned>(vec[TDim::value - 3u]);
129 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
133 template<
typename TWorkDiv>
136 TKernelFnObj
const& kernelFnObj,
144 "The work division and the execution task have to be of the same dimensionality!");
148 std::tuple<remove_restrict_t<std::decay_t<TArgs>>...>
m_args;
154 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
161 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
168 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
175 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
182 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
195 typename TKernelFnObj,
198 uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
208# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
216 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
217 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
218 auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
224# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
225 std::cout << __func__ <<
" gridDim: (" << gridDim.z <<
", " << gridDim.y <<
", " << gridDim.x <<
")\n";
226 std::cout << __func__ <<
" blockDim: (" << blockDim.z <<
", " << blockDim.y <<
", " << blockDim.x
230# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
233 if(!isValidWorkDiv<TAcc>(task,
getDev(queue)))
235 throw std::runtime_error(
236 "The given work division is not valid or not supported by the device of type "
242 auto const blockSharedMemDynSizeBytes = std::apply(
244 return getBlockSharedMemDynSizeBytes<TAcc>(
252# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
254 std::cout << __func__ <<
" BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes <<
" B"
259 = alpaka::detail::kernelName<TKernelFnObj, TAcc, remove_restrict_t<std::decay_t<TArgs>>...>;
261# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
263 typename TApi::FuncAttributes_t funcAttrs;
265 std::cout << __func__ <<
" binaryVersion: " << funcAttrs.binaryVersion
266 <<
" constSizeBytes: " << funcAttrs.constSizeBytes <<
" B"
267 <<
" localSizeBytes: " << funcAttrs.localSizeBytes <<
" B"
268 <<
" maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
269 <<
" numRegs: " << funcAttrs.numRegs <<
" ptxVersion: " << funcAttrs.ptxVersion
270 <<
" sharedSizeBytes: " << funcAttrs.sharedSizeBytes <<
" B" << std::endl;
288 static_cast<std::size_t
>(blockSharedMemDynSizeBytes),
289 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
298 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
304 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
315 template<
typename TApi,
typename TDev,
typename TDim,
typename TIdx,
typename TKernelFn,
typename... TArgs>
324 [[maybe_unused]] TDev
const& dev,
325 [[maybe_unused]] TKernelFn
const& kernelFn,
333 typename TApi::FuncAttributes_t funcAttrs;
336# pragma GCC diagnostic push
337# pragma GCC diagnostic ignored "-Wconditionally-supported"
340 TApi::funcGetAttributes(&funcAttrs,
reinterpret_cast<void const*
>(kernelName)));
342# pragma GCC diagnostic pop
346 kernelFunctionAttributes.
constSizeBytes = funcAttrs.constSizeBytes;
347 kernelFunctionAttributes.
localSizeBytes = funcAttrs.localSizeBytes;
350 kernelFunctionAttributes.
numRegs = funcAttrs.numRegs;
351 kernelFunctionAttributes.
asmVersion = funcAttrs.ptxVersion;
352 kernelFunctionAttributes.
maxThreadsPerBlock =
static_cast<int>(funcAttrs.maxThreadsPerBlock);
354# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
355 printf(
"Kernel Function Attributes: \n");
356 printf(
"binaryVersion: %d \n", funcAttrs.binaryVersion);
358 "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu maxDynamicSharedSizeBytes: %d "
360 funcAttrs.constSizeBytes,
361 funcAttrs.localSizeBytes,
362 funcAttrs.sharedSizeBytes,
363 funcAttrs.maxDynamicSharedSizeBytes);
366 "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
368 funcAttrs.ptxVersion,
369 funcAttrs.maxThreadsPerBlock);
371 return kernelFunctionAttributes;
383struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace;
385# define ALPAKA_KERNEL_NAME(KERNEL, NAME) \
387 struct The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace; \
391 The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace, \
392 ::The_ALPAKA_KERNEL_NAME_macro_must_be_called_in_the_global_namespace>, \
393 "The ALPAKA_KERNEL_NAME macro must be called in the global namespace"); \
395 template<typename TAcc, typename... TArgs> \
396 __global__ void NAME( \
397 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent, \
398 KERNEL const kernelFnObj, \
401 TAcc const acc(extent); \
402 kernelFnObj(const_cast<TAcc const&>(acc), args...); \
405 namespace alpaka::detail \
407 template<typename TAcc, typename... TArgs> \
408 inline void (*kernelName<KERNEL, TAcc, TArgs...>)( \
409 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const, \
412 = ::NAME<TAcc, TArgs...>; \
415struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace;
417# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME) \
418 struct The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace; \
422 The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace, \
423 ::The_ALPAKA_KERNEL_SCOPED_NAME_macro_must_be_called_in_the_global_namespace>, \
424 "The ALPAKA_KERNEL_SCOPED_NAME macro must be called in the global namespace"); \
426 namespace NAMESPACE \
428 template<typename TAcc, typename... TArgs> \
429 __global__ void NAME( \
430 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const extent, \
431 KERNEL const kernelFnObj, \
434 TAcc const acc(extent); \
435 kernelFnObj(const_cast<TAcc const&>(acc), args...); \
439 namespace alpaka::detail \
441 template<typename TAcc, typename... TArgs> \
442 inline void (*kernelName<KERNEL, TAcc, TArgs...>)( \
443 alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const, \
446 = ::NAMESPACE::NAME<TAcc, TArgs...>; \
454# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
455# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)
463# define ALPAKA_KERNEL_NAME(KERNEL, NAME)
464# define ALPAKA_KERNEL_SCOPED_NAME(KERNEL, NAMESPACE, NAME)
#define ALPAKA_DEBUG_MINIMAL
The minimal debug level.
#define ALPAKA_DEBUG
Set the minimum log level if it is not defined.
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
A basic class holding the work division as grid block extent, block thread extent and thread element extent.
auto clipCast(V const &val) -> T
__global__ void gpuKernel(Vec< Dim< TAcc >, Idx< TAcc > > const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
The GPU CUDA/HIP kernel entry point.
void(* kernelName)(Vec< Dim< TAcc >, Idx< TAcc > > const, TKernelFnObj const, remove_restrict_t< std::decay_t< TArgs > >...)
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
typename remove_restrict< T >::type remove_restrict_t
Helper to remove restrict from a type.
ALPAKA_FN_HOST auto getDev(T const &t)
ALPAKA_FN_HOST auto getAccName() -> std::string
typename trait::DimType< T >::type Dim
Alias template for the dimension type trait that removes the need to write ::type.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
std::size_t constSizeBytes
std::size_t localSizeBytes
std::size_t sharedSizeBytes
int maxDynamicSharedSizeBytes
The accelerator type trait.
The dimension getter type trait.
The structure template to access to the functions attributes of a kernel function object.