23 #include <type_traits>
26 # pragma clang diagnostic push
27 # pragma clang diagnostic ignored "-Wswitch-default"
49 template<
typename T,
typename = std::enable_if_t<std::is_
integral_v<T>>>
56 T divisor = maxDivisor;
57 while(dividend % divisor != 0)
65 template<
typename T,
typename = std::enable_if_t<std::is_
integral_v<T>>>
68 std::set<T> divisorSet;
74 for(T i(1); i <=
std::min(val, maxDivisor); ++i)
78 divisorSet.insert(
static_cast<T
>(val / i));
90 template<
typename TDim,
typename TIdx>
94 if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
95 || (accDevProps.m_threadElemCountMax < 1))
101 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
102 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
103 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
106 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
109 if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
133 template<
typename TDim,
typename TIdx>
138 TIdx kernelBlockThreadCountMax =
static_cast<TIdx
>(0u),
139 bool blockThreadMustDivideGridThreadExtent =
true,
144 using DimLoopInd =
typename TDim::value_type;
146 for(DimLoopInd i(0); i < TDim::value; ++i)
157 auto clippedThreadElemExtent =
elementwise_min(threadElemExtent, gridElemExtent);
158 auto const gridThreadExtent = [&]
161 for(DimLoopInd i(0u); i < TDim::value; ++i)
162 r[i] =
core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
177 blockThreadExtent =
Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
180 TIdx
const& blockThreadCountMax
187 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
189 blockThreadExtent[i] =
std::min(blockThreadExtent[i], blockThreadCountMax);
193 if(blockThreadCountMax == 1)
197 else if(blockThreadExtent.prod() > blockThreadCountMax)
199 switch(gridBlockExtentSubDivRestrictions)
206 while(blockThreadExtent.prod() > blockThreadCountMax)
207 blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
211 while(blockThreadExtent.prod() > blockThreadCountMax)
213 auto const it = std::min_element(
214 blockThreadExtent.begin(),
215 blockThreadExtent.end() - 1,
216 [](TIdx
const& a, TIdx
const& b)
232 if(blockThreadMustDivideGridThreadExtent)
234 switch(gridBlockExtentSubDivRestrictions)
241 std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
242 for(DimLoopInd i(0u); i < TDim::value; ++i)
244 gridThreadExtentDivisors[i]
248 std::set<TIdx> intersects[2u];
249 for(DimLoopInd i(1u); i < TDim::value; ++i)
251 intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
252 intersects[(i) % 2u].clear();
255 std::end(intersects[(i - 1u) % 2u]),
257 std::end(gridThreadExtentDivisors[i]),
258 std::inserter(intersects[i % 2],
std::begin(intersects[i % 2u])));
260 TIdx
const maxCommonDivisor = *(--
std::end(intersects[(TDim::value - 1) % 2u]));
261 blockThreadExtent =
Vec::all(maxCommonDivisor);
267 for(DimLoopInd i(0u); i < TDim::value; ++i)
276 auto gridBlockExtent = [&]
279 for(DimLoopInd i = 0; i < TDim::value; ++i)
280 r[i] =
core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
291 for(
typename TDim::value_type i(0); i < TDim::value; ++i)
294 if(gridBlockExtentMax[i] < gridBlockExtent[i])
296 gridBlockExtent[i] = gridBlockExtentMax[i];
298 if(blockThreadExtentMax[i] < blockThreadExtent[i])
300 blockThreadExtent[i] = blockThreadExtentMax[i];
302 if(threadElemExtentMax[i] < threadElemExtent[i])
304 clippedThreadElemExtent[i] = threadElemExtentMax[i];
308 return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
329 bool blockThreadMustDivideGridThreadExtent =
true;
332 = GridBlockExtentSubDivRestrictions::Unrestricted;
336 "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
339 "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
342 "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
345 "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
358 typename TGridElemExtent,
359 typename TThreadElemExtent,
360 typename TKernelFnObj,
364 [[maybe_unused]] TDev
const& dev,
365 TKernelFnObj
const& kernelFnObj,
374 auto const kernelFunctionAttributes
375 = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
376 auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
389 getAccDevProps<Acc>(dev),
390 static_cast<Idx<Acc>>(threadsPerBlock),
391 kernelCfg.blockThreadMustDivideGridThreadExtent,
392 kernelCfg.gridBlockExtentSubDivRestrictions);
406 template<
typename TWorkDiv,
typename TDim,
typename TIdx>
410 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
411 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
412 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
439 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
440 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
441 || (threadElemExtentMax[i] < threadElemExtent[i]))
463 template<
typename TAcc,
typename TWorkDiv,
typename TDim,
typename TIdx>
465 TWorkDiv
const& workDiv,
470 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
471 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
472 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
474 auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
476 auto const allowedThreadsPerBlock =
std::min(
477 static_cast<TIdx
>(threadsPerBlockForKernel),
484 if(allowedThreadsPerBlock < blockThreadExtent.prod())
504 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
505 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
506 || (threadElemExtentMax[i] < threadElemExtent[i]))
526 template<
typename TAcc,
typename TWorkDiv,
typename TDev,
typename TKernelFnObj,
typename... TArgs>
528 TWorkDiv
const& workDiv,
530 TKernelFnObj
const& kernelFnObj,
531 TArgs&&... args) ->
bool
533 return isValidWorkDiv<TAcc>(
535 getAccDevProps<TAcc>(dev),
536 getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
545 template<
typename TAcc,
typename TWorkDiv,
typename TDev>
553 # pragma clang diagnostic pop
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
ALPAKA_NO_HOST_ACC_WARNING static constexpr ALPAKA_FN_HOST_ACC auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
A basic class holding the work division as grid block extent, block thread and thread element extent.
constexpr ALPAKA_FN_HOST_ACC auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
ALPAKA_NO_HOST_ACC_WARNING constexpr ALPAKA_FN_HOST_ACC auto assertValueUnsigned(TArg const &arg) -> void
This method checks whether integral values are greater than or equal to zero. The implementation prevents war...
constexpr ALPAKA_FN_HOST_ACC auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where dividend % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto min(T const &min_ctx, Tx const &x, Ty const &y)
Returns the smaller of two arguments. NaNs are treated as missing data (between a NaN and a numeric v...
ALPAKA_FN_HOST auto end(TView &view) -> Iterator< TView >
ALPAKA_FN_HOST auto begin(TView &view) -> Iterator< TView >
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, TDev const &dev) -> bool
Checks if the work division is supported by the device.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T >>
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, [[maybe_unused]] TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc >>
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING constexpr ALPAKA_FN_HOST_ACC auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex >>
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...