27# pragma clang diagnostic push
28# pragma clang diagnostic ignored "-Wswitch-default"
50 template<
typename T,
typename = std::enable_if_t<std::is_
integral_v<T>>>
57 T divisor = maxDivisor;
58 while(dividend % divisor != 0)
66 template<
typename T,
typename = std::enable_if_t<std::is_
integral_v<T>>>
69 std::set<T> divisorSet;
75 for(T i(1); i <= std::min(val, maxDivisor); ++i)
79 divisorSet.insert(
static_cast<T
>(val / i));
91 template<
typename TDim,
typename TIdx>
95 if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
96 || (accDevProps.m_threadElemCountMax < 1))
102 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
103 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
104 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
107 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
110 if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
134 template<
typename TDim,
typename TIdx>
139 TIdx kernelBlockThreadCountMax =
static_cast<TIdx
>(0u),
140 bool blockThreadMustDivideGridThreadExtent =
true,
145 using DimLoopInd =
typename TDim::value_type;
147 for(DimLoopInd i(0); i < TDim::value; ++i)
158 auto clippedThreadElemExtent =
elementwise_min(threadElemExtent, gridElemExtent);
159 auto const gridThreadExtent = [&]
162 for(DimLoopInd i(0u); i < TDim::value; ++i)
163 r[i] =
core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
178 blockThreadExtent =
Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
181 TIdx
const& blockThreadCountMax
188 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
190 blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
194 if(blockThreadCountMax == 1)
198 else if(blockThreadExtent.prod() > blockThreadCountMax)
200 switch(gridBlockExtentSubDivRestrictions)
207 while(blockThreadExtent.prod() > blockThreadCountMax)
208 blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
212 while(blockThreadExtent.prod() > blockThreadCountMax)
214 auto const it = std::min_element(
215 blockThreadExtent.begin(),
216 blockThreadExtent.end() - 1,
217 [](TIdx
const& a, TIdx
const& b)
233 if(blockThreadMustDivideGridThreadExtent)
235 switch(gridBlockExtentSubDivRestrictions)
242 std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
243 for(DimLoopInd i(0u); i < TDim::value; ++i)
245 gridThreadExtentDivisors[i]
249 std::set<TIdx> intersects[2u];
250 for(DimLoopInd i(1u); i < TDim::value; ++i)
252 intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
253 intersects[(i) % 2u].clear();
255 std::begin(intersects[(i - 1u) % 2u]),
256 std::end(intersects[(i - 1u) % 2u]),
257 std::begin(gridThreadExtentDivisors[i]),
258 std::end(gridThreadExtentDivisors[i]),
259 std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
261 TIdx
const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
262 blockThreadExtent =
Vec::all(maxCommonDivisor);
268 for(DimLoopInd i(0u); i < TDim::value; ++i)
277 auto gridBlockExtent = [&]
280 for(DimLoopInd i = 0; i < TDim::value; ++i)
281 r[i] =
core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
292 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
295 if(gridBlockExtentMax[i] < gridBlockExtent[i])
297 gridBlockExtent[i] = gridBlockExtentMax[i];
299 if(blockThreadExtentMax[i] < blockThreadExtent[i])
301 blockThreadExtent[i] = blockThreadExtentMax[i];
303 if(threadElemExtentMax[i] < threadElemExtent[i])
305 clippedThreadElemExtent[i] = threadElemExtentMax[i];
309 return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
330 bool blockThreadMustDivideGridThreadExtent =
true;
333 = GridBlockExtentSubDivRestrictions::Unrestricted;
337 "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
340 "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
342 std::is_same_v<Idx<TGridElemExtent>,
Idx<TAcc>>,
343 "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
345 std::is_same_v<Idx<TThreadElemExtent>,
Idx<TAcc>>,
346 "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
359 typename TGridElemExtent,
360 typename TThreadElemExtent,
361 typename TKernelFnObj,
365 [[maybe_unused]] TDev
const& dev,
366 TKernelFnObj
const& kernelFnObj,
375 auto const kernelFunctionAttributes
376 = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
377 auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
390 getAccDevProps<Acc>(dev),
391 static_cast<Idx<Acc>>(threadsPerBlock),
392 kernelCfg.blockThreadMustDivideGridThreadExtent,
393 kernelCfg.gridBlockExtentSubDivRestrictions);
407 template<
typename TWorkDiv,
typename TDim,
typename TIdx>
411 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
412 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
413 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
440 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
441 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
442 || (threadElemExtentMax[i] < threadElemExtent[i]))
464 template<
typename TAcc,
typename TWorkDiv,
typename TDim,
typename TIdx>
466 TWorkDiv
const& workDiv,
471 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
472 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
473 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
475 auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
477 auto const allowedThreadsPerBlock = std::min(
478 static_cast<TIdx
>(threadsPerBlockForKernel),
485 if(allowedThreadsPerBlock < blockThreadExtent.prod())
505 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
506 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
507 || (threadElemExtentMax[i] < threadElemExtent[i]))
527 template<
typename TAcc,
typename TWorkDiv,
typename TDev,
typename TKernelFnObj,
typename... TArgs>
529 TWorkDiv
const& workDiv,
531 TKernelFnObj
const& kernelFnObj,
532 TArgs&&... args) ->
bool
534 return isValidWorkDiv<TAcc>(
536 getAccDevProps<TAcc>(dev),
537 getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
546 template<
typename TAcc,
typename TWorkDiv,
typename TDev>
554# pragma clang diagnostic pop
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
ALPAKA_NO_HOST_ACC_WARNING static ALPAKA_FN_HOST_ACC constexpr auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
A basic class holding the work division as grid block extent, block thread and thread element extent.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const &arg) -> void
This method checks whether integral values are greater than or equal to zero. The implementation prevents war...
ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where dividend % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc > >
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, AccDevProps< TDim, TIdx > const &accDevProps) -> bool
Checks if the work division is supported.
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T > >
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex > >
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...