30 #include <type_traits>
31 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
35 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
37 # if !defined(ALPAKA_HOST_ONLY)
41 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
42 # error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
45 # if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
46 # error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
54 # pragma clang diagnostic push
55 # pragma clang diagnostic ignored "-Wunused-template"
61 template<
typename TKernelFnObj,
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename... TArgs>
64 TKernelFnObj
const kernelFnObj,
67 # if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
68 # error "Device capability >= 2.0 is required!"
71 const TAcc acc(threadElemExtent);
74 # if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
76 std::is_same_v<decltype(kernelFnObj(
const_cast<TAcc const&
>(acc), args...)),
void>,
77 "The TKernelFnObj is required to return void!");
79 kernelFnObj(
const_cast<TAcc const&
>(acc), args...);
82 # pragma clang diagnostic pop
86 namespace uniform_cuda_hip
90 template<
typename TDim,
typename TIdx>
93 if constexpr(TDim::value > 0)
95 for(
auto i =
std::min(
typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
97 if(vec[TDim::value - 1u - i] != 1)
99 throw std::runtime_error(
100 "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
101 "work division extents of the dimensions higher 3 have to be 1!");
107 template<
typename TDim,
typename TIdx>
111 if constexpr(TDim::value >= 1)
112 dim.x =
static_cast<unsigned>(vec[TDim::value - 1u]);
113 if constexpr(TDim::value >= 2)
114 dim.y =
static_cast<unsigned>(vec[TDim::value - 2u]);
115 if constexpr(TDim::value >= 3)
116 dim.z =
static_cast<unsigned>(vec[TDim::value - 3u]);
124 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
128 template<
typename TWorkDiv>
131 TKernelFnObj
const& kernelFnObj,
135 ,
m_args(std::forward<TArgs>(args)...)
138 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
139 "The work division and the execution task have to be of the same dimensionality!");
143 std::tuple<remove_restrict_t<std::decay_t<TArgs>>...>
m_args;
149 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
156 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
163 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
170 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
177 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
184 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
196 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
204 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
205 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
206 auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
212 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
213 std::cout << __func__ <<
" gridDim: " << gridDim.z <<
" " << gridDim.y <<
" " << gridDim.x
214 <<
" blockDim: " << blockDim.z <<
" " << blockDim.y <<
" " << blockDim.x << std::endl;
217 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
219 if(!isValidWorkDiv<TAcc>(
getDev(queue), task))
221 throw std::runtime_error(
222 "The given work division is not valid or not supported by the device of type "
228 auto const blockSharedMemDynSizeBytes = std::apply(
230 return getBlockSharedMemDynSizeBytes<TAcc>(
238 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
240 std::cout << __func__ <<
" BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes <<
" B"
243 auto kernelName = alpaka::detail::
244 gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
246 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
248 typename TApi::FuncAttributes_t funcAttrs;
250 std::cout << __func__ <<
" binaryVersion: " << funcAttrs.binaryVersion
251 <<
" constSizeBytes: " << funcAttrs.constSizeBytes <<
" B"
252 <<
" localSizeBytes: " << funcAttrs.localSizeBytes <<
" B"
253 <<
" maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
254 <<
" numRegs: " << funcAttrs.numRegs <<
" ptxVersion: " << funcAttrs.ptxVersion
255 <<
" sharedSizeBytes: " << funcAttrs.sharedSizeBytes <<
" B" << std::endl;
272 static_cast<std::size_t
>(blockSharedMemDynSizeBytes),
273 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
282 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
283 auto const msg = std::string{
284 "'execution of kernel: '" + std::string{core::demangled<TKernelFnObj>} +
"' failed with"};
285 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
291 template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
303 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
311 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
312 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
313 auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
319 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
320 std::cout << __func__ <<
"gridDim: " << gridDim.z <<
" " << gridDim.y <<
" " << gridDim.x << std::endl;
321 std::cout << __func__ <<
"blockDim: " << blockDim.z <<
" " << blockDim.y <<
" " << blockDim.x
325 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
327 if(!isValidWorkDiv<TAcc>(
getDev(queue), task))
329 throw std::runtime_error(
330 "The given work division is not valid or not supported by the device of type "
336 auto const blockSharedMemDynSizeBytes = std::apply(
338 return getBlockSharedMemDynSizeBytes<TAcc>(
346 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
348 std::cout << __func__ <<
" BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes <<
" B"
352 auto kernelName = alpaka::detail::
353 gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
354 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
356 typename TApi::FuncAttributes_t funcAttrs;
358 std::cout << __func__ <<
" binaryVersion: " << funcAttrs.binaryVersion
359 <<
" constSizeBytes: " << funcAttrs.constSizeBytes <<
" B"
360 <<
" localSizeBytes: " << funcAttrs.localSizeBytes <<
" B"
361 <<
" maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
362 <<
" numRegs: " << funcAttrs.numRegs <<
" ptxVersion: " << funcAttrs.ptxVersion
363 <<
" sharedSizeBytes: " << funcAttrs.sharedSizeBytes <<
" B" << std::endl;
376 static_cast<std::size_t
>(blockSharedMemDynSizeBytes),
377 queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
384 std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
388 = std::string{
"'execution of kernel: '" + core::demangled<TKernelFnObj> +
"' failed with"};
389 ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
#define ALPAKA_DEBUG_MINIMAL
The minimal debug level.
#define ALPAKA_DEBUG
Set the minimum log level if it is not defined.
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
A basic class holding the work division as grid block extent, block thread and thread element extent.
__global__ void gpuKernel(Vec< TDim, TIdx > const threadElemExtent, TKernelFnObj const kernelFnObj, TArgs... args)
The GPU CUDA/HIP kernel entry point.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto min(T const &min_ctx, Tx const &x, Ty const &y)
Returns the smaller of two arguments. NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
The alpaka accelerator library.
typename remove_restrict< T >::type remove_restrict_t
Helper to remove restrict from a type.
ALPAKA_FN_HOST auto getDev(T const &t)
ALPAKA_FN_HOST auto getAccName() -> std::string
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
The accelerator type trait.
The dimension getter type trait.