#include <type_traits>

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
#    include <iostream>
#endif

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

#    if !defined(ALPAKA_HOST_ONLY)

#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif
namespace alpaka
{
    namespace detail
    {
#        if BOOST_COMP_CLANG
#            pragma clang diagnostic push
#            pragma clang diagnostic ignored "-Wunused-template"
#        endif
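        //! The GPU CUDA/HIP kernel entry point.
        // A __global__ function cannot have a trailing return type, so the kernel is a plain void function.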
        template<typename TKernelFnObj, typename TApi, typename TAcc, typename TDim, typename TIdx, typename... TArgs>
        __global__ void gpuKernel(Vec<TDim, TIdx> const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
        {
            TAcc const acc(threadElemExtent);
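            // With clang as CUDA compiler the return type of a pure device lambda cannot be queried on the
            // host side, so the void-return check below is skipped for that combination.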
#        if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
#        endif
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
#        if BOOST_COMP_CLANG
#            pragma clang diagnostic pop
#        endif
    } // namespace detail
    namespace uniform_cuda_hip
    {
        namespace detail
        {
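            //! Checks that all work-division extents beyond the third dimension are one,
            //! since CUDA/HIP can map at most three dimensions natively.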
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
            {
                if constexpr(TDim::value > 0)
                {
                    for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
                    {
                        if(vec[TDim::value - 1u - i] != 1)
                        {
                            throw std::runtime_error(
                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
                                "work division extents of the dimensions higher than 3 have to be 1!");
                        }
                    }
                }
            }
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
            {
                dim3 dim(1, 1, 1);
                if constexpr(TDim::value >= 1)
                    dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
                if constexpr(TDim::value >= 2)
                    dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
                if constexpr(TDim::value >= 3)
                    dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
                checkVecOnly3Dim(vec);
                return dim;
            }
        } // namespace detail
    } // namespace uniform_cuda_hip
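    // Example (illustration, not part of the original source): a 3-dimensional work division
    // Vec{8, 16, 32}, ordered from slowest- to fastest-varying, becomes dim3(32, 16, 8),
    // since vec[TDim::value - 1u] is the fastest-varying extent and maps to dim.x.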
    //! The GPU CUDA/HIP accelerator execution task.
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
    {
    public:
        template<typename TWorkDiv>
        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
            , m_kernelFnObj(kernelFnObj)
            , m_args(std::forward<TArgs>(args)...)
        {
            static_assert(
                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
                "The work division and the execution task have to be of the same dimensionality!");
        }

        TKernelFnObj m_kernelFnObj;
        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
    };
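    // Usage sketch (assumption: a device, queue and work division have already been set up;
    // createTaskKernel/enqueue and the one-shot exec are the regular alpaka entry points):
    //   auto task = alpaka::createTaskKernel<TAcc>(workDiv, kernelFnObj, args...);
    //   alpaka::enqueue(queue, task);
    //   // or equivalently: alpaka::exec<TAcc>(queue, workDiv, kernelFnObj, args...);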
    namespace trait
    {
        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TAcc;
        };

        //! The GPU CUDA/HIP execution task device type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The GPU CUDA/HIP execution task platform type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };

        //! The CUDA/HIP kernel enqueue trait specialization.
        template<
            typename TApi,
            bool TBlocking,
            typename TAcc,
            typename TDim,
            typename TIdx,
            typename TKernelFnObj,
            typename... TArgs>
        struct Enqueue<
            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Optional diagnostics, e.g. querying the device printf FIFO size, can be placed here.
#    endif
                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);

                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
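                // Of the three work-division levels read above (blocks per grid, threads per block,
                // elements per thread), only the first two map to native launch parameters; the
                // element extent is passed on to the kernel as an argument instead.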
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x
                          << ")\n";
                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
                          << ")\n";
#    endif
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                // Check that the work division is supported by the device.
                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
                {
                    throw std::runtime_error(
                        "The given work division is not valid or not supported by the device of type "
                        + getAccName<TAcc>() + "!");
                }
#    endif
                // Query the size of the dynamically allocated block shared memory for this kernel.
                auto const blockSharedMemDynSizeBytes = std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        return getBlockSharedMemDynSizeBytes<TAcc>(
                            task.m_kernelFnObj,
                            blockThreadExtent,
                            threadElemExtent,
                            args...);
                    },
                    task.m_args);
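                // Kernels request dynamic shared memory by specializing trait::BlockSharedMemDynSizeBytes.
                // A minimal sketch (MyKernel is a hypothetical kernel function object):
                //   template<typename TAcc>
                //   struct alpaka::trait::BlockSharedMemDynSizeBytes<MyKernel, TAcc>
                //   {
                //       template<typename TVec, typename... TArgs>
                //       ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
                //           MyKernel const&, TVec const& blockThreadExtent, TVec const&, TArgs const&...)
                //           -> std::size_t
                //       {
                //           return static_cast<std::size_t>(blockThreadExtent.prod()) * sizeof(float);
                //       }
                //   };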
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the block shared memory size.
                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
                          << std::endl;
#    endif
                auto kernelName = alpaka::detail::
                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the kernel function attributes.
                typename TApi::FuncAttributes_t funcAttrs;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
#    endif
                // Enqueue the kernel execution.
                std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        kernelName<<<
                            gridDim,
                            blockDim,
                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
                    },
                    task.m_args);
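                // For blocking queues (and whenever minimal debugging is enabled) the launch is
                // synchronized here. alpaka::wait is deliberately not used: it would check the error
                // itself, while the custom error message below should be reported instead.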
                if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                {
                    std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
                }
                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                {
                    auto const msg
                        = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
                }
            }
        };
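        //! The structure template to access the function attributes of a kernel function object.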
        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
        {
            ALPAKA_FN_HOST static auto getFunctionAttributes(
                [[maybe_unused]] TDev const& dev,
                [[maybe_unused]] TKernelFn const& kernelFn,
                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
            {
                auto kernelName = alpaka::detail::gpuKernel<
                    TKernelFn,
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    remove_restrict_t<std::decay_t<TArgs>>...>;

                typename TApi::FuncAttributes_t funcAttrs;
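                // funcGetAttributes expects a void const* to the kernel, and casting a function
                // pointer to an object pointer is only conditionally supported, hence the gcc
                // warning suppression below.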
#    if BOOST_COMP_GNUC
#        pragma GCC diagnostic push
#        pragma GCC diagnostic ignored "-Wconditionally-supported"
#    endif
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
#    if BOOST_COMP_GNUC
#        pragma GCC diagnostic pop
#    endif
                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
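                // asmVersion abstracts the backend's ptxVersion attribute (the PTX ISA version on
                // CUDA; HIP exposes a compatible field in its native attributes struct).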
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printf("Kernel Function Attributes: \n");
                printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
                printf(
                    "constSizeBytes: %zu \n localSizeBytes: %zu, sharedSizeBytes %zu maxDynamicSharedSizeBytes: %d "
                    "\n",
                    funcAttrs.constSizeBytes,
                    funcAttrs.localSizeBytes,
                    funcAttrs.sharedSizeBytes,
                    funcAttrs.maxDynamicSharedSizeBytes);
                printf(
                    "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d \n",
                    funcAttrs.numRegs,
                    funcAttrs.ptxVersion,
                    funcAttrs.maxThreadsPerBlock);
#    endif
                return kernelFunctionAttributes;
            }
        };
    } // namespace trait
} // namespace alpaka

#    endif

#endif