#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
// ...
#endif

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

#    if !defined(ALPAKA_HOST_ONLY)

#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif

namespace alpaka
{
    namespace detail
    {
#        if BOOST_COMP_CLANG
#            pragma clang diagnostic push
#            pragma clang diagnostic ignored "-Wunused-template"
#        endif
        //! The GPU CUDA/HIP kernel entry point.
        template<
            typename TKernelFnObj,
            typename TApi,
            typename TAcc,
            typename TDim,
            typename TIdx,
            typename... TArgs>
        __global__ void gpuKernel(Vec<TDim, TIdx> const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
        {
            TAcc const acc(threadElemExtent);

            // With clang it is not possible to query the return type of a pure device lambda created on the host side.
#        if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
            static_assert(
                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
                "The TKernelFnObj is required to return void!");
#        endif
            kernelFnObj(const_cast<TAcc const&>(acc), args...);
        }
#        if BOOST_COMP_CLANG
#            pragma clang diagnostic pop
#        endif
    } // namespace detail
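
    // Illustrative sketch (not part of this header): a minimal kernel function object that
    // satisfies the static_assert above. operator() takes the accelerator as its first
    // parameter and must return void; all names below are hypothetical.
    //
    // \code
    // struct SaxpyKernel
    // {
    //     template<typename TAcc>
    //     ALPAKA_FN_ACC void operator()(TAcc const& acc, float a, float const* x, float* y, std::size_t n) const
    //     {
    //         auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
    //         if(i < n)
    //             y[i] = a * x[i] + y[i];
    //     }
    // };
    // \endcode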
    namespace uniform_cuda_hip
    {
        namespace detail
        {
            //! Checks that all extents beyond the third dimension are equal to one.
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
            {
                if constexpr(TDim::value > 0)
                {
                    for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
                    {
                        if(vec[TDim::value - 1u - i] != 1)
                        {
                            throw std::runtime_error(
                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
                                "work division extents of the dimensions higher than 3 have to be 1!");
                        }
                    }
                }
            }
            //! Maps an alpaka extent vector onto a CUDA/HIP dim3, reversing the dimension order
            //! so that the fastest-varying alpaka dimension becomes dim3::x.
            template<typename TDim, typename TIdx>
            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
            {
                dim3 dim(1, 1, 1);
                if constexpr(TDim::value >= 1)
                    dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
                if constexpr(TDim::value >= 2)
                    dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
                if constexpr(TDim::value >= 3)
                    dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
                return dim;
            }
        } // namespace detail
    } // namespace uniform_cuda_hip
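
    // Illustrative sketch (not part of this header): for a 3-dimensional extent vector the
    // reversal above maps the slowest-varying alpaka dimension 0 onto dim3::z:
    //
    // \code
    // auto const vec = alpaka::Vec<alpaka::DimInt<3u>, std::size_t>{8, 4, 2};
    // dim3 const d = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(vec);
    // assert(d.x == 2 && d.y == 4 && d.z == 8);
    // \endcode
    //
    // A vector with more than three dimensions passes checkVecOnly3Dim only if every extent
    // beyond the last three is 1, e.g. {1, 1, 8, 4, 2} for a 5-dimensional vector.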
    //! The GPU CUDA/HIP accelerator execution task.
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
    {
    public:
        template<typename TWorkDiv>
        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
            , m_kernelFnObj(kernelFnObj)
            , m_args(std::forward<TArgs>(args)...)
        {
            static_assert(
                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
                "The work division and the execution task have to be of the same dimensionality!");
        }

        TKernelFnObj m_kernelFnObj;
        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
    };
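
    // Illustrative sketch (not part of this header): tasks of this type are usually not
    // constructed by hand but through alpaka::createTaskKernel, which deduces all template
    // arguments; the accelerator, work-division, queue, and kernel names are hypothetical.
    //
    // \code
    // auto const task = alpaka::createTaskKernel<Acc>(workDiv, SaxpyKernel{}, a, xDevPtr, yDevPtr, n);
    // alpaka::enqueue(queue, task);
    // \endcode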
    namespace trait
    {
        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TAcc;
        };

        //! The GPU CUDA/HIP execution task device type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The GPU CUDA/HIP execution task platform type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP execution task idx type trait specialization.
        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };
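
        // Illustrative sketch (not part of this header): these specializations are what make
        // the generic alpaka alias templates resolve for the task type, e.g.:
        //
        // \code
        // using Task = decltype(alpaka::createTaskKernel<Acc>(workDiv, SaxpyKernel{}));
        // static_assert(std::is_same_v<alpaka::Acc<Task>, Acc>);
        // static_assert(std::is_same_v<alpaka::Dim<Task>, alpaka::Dim<Acc>>);
        // \endcode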
        //! The CUDA/HIP kernel enqueue trait specialization.
        template<typename TApi, bool TBlocking, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct Enqueue<
            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // ...
#        endif

                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);

                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x
                          << ")\n";
                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
                          << ")\n";
#        endif
#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                // Check that the work division is valid for the device the queue belongs to.
                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
                {
                    throw std::runtime_error(
                        "The given work division is not valid or not supported by the device of type "
                        + getAccName<TAcc>() + "!");
                }
#        endif
                // Get the size of the block shared dynamic memory.
                auto const blockSharedMemDynSizeBytes = std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        return getBlockSharedMemDynSizeBytes<TAcc>(
                            task.m_kernelFnObj, blockThreadExtent, threadElemExtent, args...);
                    },
                    task.m_args);

#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
                          << std::endl;
#        endif

                auto kernelName = alpaka::detail::
                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                // Log the kernel function attributes.
                typename TApi::FuncAttributes_t funcAttrs;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
#        endif

                // Enqueue the kernel execution.
                std::apply(
                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
                        kernelName<<<
                            gridDim,
                            blockDim,
                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
                    },
                    task.m_args);

                // For a blocking queue, and in minimal-debug builds, wait for the kernel to finish;
                // the return value is ignored because the error check below gives a better message.
                std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
                auto const msg
                    = std::string{"'execution of kernel: '" + std::string{typeid(TKernelFnObj).name()} + "' failed with"};
                ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
            }
        };
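
        // Illustrative sketch (not part of this header): from user code the enqueue path above
        // is usually driven through alpaka::exec, which creates the task and enqueues it; the
        // device, queue, work-division, and kernel names are hypothetical.
        //
        // \code
        // using Acc = alpaka::AccGpuCudaRt<alpaka::DimInt<1u>, std::size_t>;
        // auto const platform = alpaka::Platform<Acc>{};
        // auto const dev = alpaka::getDevByIdx(platform, 0);
        // auto queue = alpaka::Queue<Acc, alpaka::NonBlocking>{dev};
        // alpaka::exec<Acc>(queue, workDiv, SaxpyKernel{}, a, xDevPtr, yDevPtr, n);
        // alpaka::wait(queue);
        // \endcode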
        //! The GPU CUDA/HIP kernel function attributes trait specialization.
        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
        {
            ALPAKA_FN_HOST static auto getFunctionAttributes(
                [[maybe_unused]] TDev const& dev,
                [[maybe_unused]] TKernelFn const& kernelFn,
                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
            {
                auto kernelName = alpaka::detail::gpuKernel<
                    TKernelFn,
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    remove_restrict_t<std::decay_t<TArgs>>...>;

                typename TApi::FuncAttributes_t funcAttrs;
#        if BOOST_COMP_GNUC
                // Silence the gcc warning about the conditionally-supported function pointer cast below.
#            pragma GCC diagnostic push
#            pragma GCC diagnostic ignored "-Wconditionally-supported"
#        endif
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
#        if BOOST_COMP_GNUC
#            pragma GCC diagnostic pop
#        endif
                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printf("Kernel function attributes:\n");
                printf("binaryVersion: %d\n", funcAttrs.binaryVersion);
                printf(
                    "constSizeBytes: %zu\nlocalSizeBytes: %zu, sharedSizeBytes: %zu, maxDynamicSharedSizeBytes: %d\n",
                    funcAttrs.constSizeBytes,
                    funcAttrs.localSizeBytes,
                    funcAttrs.sharedSizeBytes,
                    funcAttrs.maxDynamicSharedSizeBytes);
                printf(
                    "numRegs: %d, ptxVersion: %d\nmaxThreadsPerBlock: %d\n",
                    funcAttrs.numRegs,
                    funcAttrs.ptxVersion,
                    funcAttrs.maxThreadsPerBlock);
#        endif
                return kernelFunctionAttributes;
            }
        };
    } // namespace trait
} // namespace alpaka

#    endif

#endif
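
// Illustrative sketch (not part of this header): user code can reach this trait through the
// alpaka::getFunctionAttributes free function, e.g. to bound the block size by the hardware
// limit for a specific kernel; the names below are hypothetical.
//
// \code
// auto const attrs = alpaka::getFunctionAttributes<Acc>(dev, SaxpyKernel{}, a, xDevPtr, yDevPtr, n);
// auto const maxThreadsPerBlock = attrs.maxThreadsPerBlock;
// \endcode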