27# pragma clang diagnostic push
28# pragma clang diagnostic ignored "-Wswitch-default"
50 template<
typename T,
typename = std::enable_if_t<std::is_
integral_v<T>>>
57 T divisor = maxDivisor;
58 while(dividend % divisor != 0)
66 template<
typename T,
typename = std::enable_if_t<std::is_
integral_v<T>>>
69 std::set<T> divisorSet;
75 for(T i(1); i <= std::min(val, maxDivisor); ++i)
79 divisorSet.insert(
static_cast<T
>(val / i));
91 template<
typename TDim,
typename TIdx>
95 if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
96 || (accDevProps.m_threadElemCountMax < 1))
102 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
103 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
104 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
107 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
110 if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
134 template<
typename TDim,
typename TIdx>
139 TIdx kernelBlockThreadCountMax =
static_cast<TIdx
>(0u),
140 bool blockThreadMustDivideGridThreadExtent =
true,
145 using DimLoopInd =
typename TDim::value_type;
147 for(DimLoopInd i(0); i < TDim::value; ++i)
158 auto clippedThreadElemExtent =
elementwise_min(threadElemExtent, gridElemExtent);
159 auto const gridThreadExtent = [&]
162 for(DimLoopInd i(0u); i < TDim::value; ++i)
163 r[i] =
core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
164#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
165# pragma clang diagnostic push
166# pragma clang diagnostic ignored "-Wnrvo"
169#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
170# pragma clang diagnostic pop
185 blockThreadExtent =
Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
188 TIdx
const& blockThreadCountMax
195 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
197 blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
201 if(blockThreadCountMax == 1)
205 else if(blockThreadExtent.prod() > blockThreadCountMax)
207#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
208# pragma clang diagnostic push
209# pragma clang diagnostic ignored "-Wswitch-default"
211 switch(gridBlockExtentSubDivRestrictions)
218 while(blockThreadExtent.prod() > blockThreadCountMax)
219 blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
223 while(blockThreadExtent.prod() > blockThreadCountMax)
225 auto const it = std::min_element(
226 blockThreadExtent.begin(),
227 blockThreadExtent.end() - 1,
228 [](TIdx
const& a, TIdx
const& b)
240#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
241# pragma clang diagnostic pop
247 if(blockThreadMustDivideGridThreadExtent)
249#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
250# pragma clang diagnostic push
251# pragma clang diagnostic ignored "-Wswitch-default"
253 switch(gridBlockExtentSubDivRestrictions)
260 std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
261 for(DimLoopInd i(0u); i < TDim::value; ++i)
263 gridThreadExtentDivisors[i]
267 std::set<TIdx> intersects[2u];
268 for(DimLoopInd i(1u); i < TDim::value; ++i)
270 intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
271 intersects[(i) % 2u].clear();
273 std::begin(intersects[(i - 1u) % 2u]),
274 std::end(intersects[(i - 1u) % 2u]),
275 std::begin(gridThreadExtentDivisors[i]),
276 std::end(gridThreadExtentDivisors[i]),
277 std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
279 TIdx
const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
280 blockThreadExtent =
Vec::all(maxCommonDivisor);
286 for(DimLoopInd i(0u); i < TDim::value; ++i)
292#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
293# pragma clang diagnostic pop
298 auto gridBlockExtent = [&]
301 for(DimLoopInd i = 0; i < TDim::value; ++i)
302 r[i] =
core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
303#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
304# pragma clang diagnostic push
305# pragma clang diagnostic ignored "-Wnrvo"
308#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
309# pragma clang diagnostic pop
320 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
323 if(gridBlockExtentMax[i] < gridBlockExtent[i])
325 gridBlockExtent[i] = gridBlockExtentMax[i];
327 if(blockThreadExtentMax[i] < blockThreadExtent[i])
329 blockThreadExtent[i] = blockThreadExtentMax[i];
331 if(threadElemExtentMax[i] < threadElemExtent[i])
333 clippedThreadElemExtent[i] = threadElemExtentMax[i];
337 return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
358 bool blockThreadMustDivideGridThreadExtent =
true;
361 = GridBlockExtentSubDivRestrictions::Unrestricted;
365 "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
368 "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
370 std::is_same_v<Idx<TGridElemExtent>,
Idx<TAcc>>,
371 "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
373 std::is_same_v<Idx<TThreadElemExtent>,
Idx<TAcc>>,
374 "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
387 typename TGridElemExtent,
388 typename TThreadElemExtent,
389 typename TKernelFnObj,
393 [[maybe_unused]] TDev
const& dev,
394 TKernelFnObj
const& kernelFnObj,
403 auto const kernelFunctionAttributes
404 = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
405 auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
418 getAccDevProps<Acc>(dev),
419 static_cast<Idx<Acc>>(threadsPerBlock),
420 kernelCfg.blockThreadMustDivideGridThreadExtent,
421 kernelCfg.gridBlockExtentSubDivRestrictions);
435 template<
typename TWorkDiv,
typename TDim,
typename TIdx>
439 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
440 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
441 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
468 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
469 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
470 || (threadElemExtentMax[i] < threadElemExtent[i]))
492 template<
typename TAcc,
typename TWorkDiv,
typename TDim,
typename TIdx>
494 TWorkDiv
const& workDiv,
499 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
500 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
501 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
503 auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
505 auto const allowedThreadsPerBlock = std::min(
506 static_cast<TIdx
>(threadsPerBlockForKernel),
513 if(allowedThreadsPerBlock < blockThreadExtent.prod())
533 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
534 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
535 || (threadElemExtentMax[i] < threadElemExtent[i]))
555 template<
typename TAcc,
typename TWorkDiv,
typename TDev,
typename TKernelFnObj,
typename... TArgs>
557 TWorkDiv
const& workDiv,
559 TKernelFnObj
const& kernelFnObj,
560 TArgs&&... args) ->
bool
562 return isValidWorkDiv<TAcc>(
564 getAccDevProps<TAcc>(dev),
565 getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
574 template<
typename TAcc,
typename TWorkDiv,
typename TDev>
582# pragma clang diagnostic pop
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
ALPAKA_NO_HOST_ACC_WARNING static ALPAKA_FN_HOST_ACC constexpr auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
A basic class holding the work division as grid block extent, block thread and thread element extent.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const &arg) -> void
This method checks whether integral values are greater than or equal to zero. The implementation prevents war...
ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where dividend % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc > >
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, AccDevProps< TDim, TIdx > const &accDevProps) -> bool
Checks if the work division is supported.
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T > >
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex > >
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...