alpaka
Abstraction Library for Parallel Kernel Acceleration
WorkDivHelpers.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 #include "alpaka/acc/Traits.hpp"
8 #include "alpaka/core/Assert.hpp"
9 #include "alpaka/core/Common.hpp"
10 #include "alpaka/core/Utility.hpp"
11 #include "alpaka/dev/Traits.hpp"
12 #include "alpaka/extent/Traits.hpp"
14 #include "alpaka/kernel/Traits.hpp"
15 #include "alpaka/vec/Vec.hpp"
17 
18 #include <algorithm>
19 #include <array>
20 #include <cmath>
21 #include <functional>
22 #include <set>
23 #include <type_traits>
24 
25 #if BOOST_COMP_CLANG
26 # pragma clang diagnostic push
27 # pragma clang diagnostic ignored "-Wswitch-default"
28 #endif
29 
30 //! The alpaka library.
31 namespace alpaka
32 {
//! The grid block extent subdivision restrictions.
//! Selects which shape constraint is applied to the block thread extent when a grid is subdivided.
enum class GridBlockExtentSubDivRestrictions
{
    EqualExtent, //!< The block thread extent will be equal in all dimensions.
    CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
    Unrestricted, //!< The block thread extent will not have any restrictions.
};
40 
41  namespace detail
42  {
43  //! Finds the largest divisor where divident % divisor == 0
44  //! \param dividend The dividend.
45  //! \param maxDivisor The maximum divisor.
46  //! \return The biggest number that satisfies the following conditions:
47  //! 1) dividend%ret==0
48  //! 2) ret<=maxDivisor
49  template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
50  ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
51  {
52  core::assertValueUnsigned(dividend);
53  core::assertValueUnsigned(maxDivisor);
54  ALPAKA_ASSERT(dividend >= maxDivisor);
55 
56  T divisor = maxDivisor;
57  while(dividend % divisor != 0)
58  --divisor;
59  return divisor;
60  }
61 
62  //! \param val The value to find divisors of.
63  //! \param maxDivisor The maximum.
64  //! \return A list of all divisors less then or equal to the given maximum.
65  template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
66  ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
67  {
68  std::set<T> divisorSet;
69 
71  core::assertValueUnsigned(maxDivisor);
72  ALPAKA_ASSERT(maxDivisor <= val);
73 
74  for(T i(1); i <= std::min(val, maxDivisor); ++i)
75  {
76  if(val % i == 0)
77  {
78  divisorSet.insert(static_cast<T>(val / i));
79  }
80  }
81 
82  return divisorSet;
83  }
84  } // namespace detail
85 
86  //! \tparam TDim The dimensionality of the accelerator device properties.
87  //! \tparam TIdx The idx type of the accelerator device properties.
88  //! \param accDevProps The maxima for the work division.
89  //! \return If the accelerator device properties are valid.
90  template<typename TDim, typename TIdx>
92  {
93  // Check that the maximum counts are greater or equal 1.
94  if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
95  || (accDevProps.m_threadElemCountMax < 1))
96  {
97  return false;
98  }
99 
100  // Store the maxima allowed for extents of grid, blocks and threads.
101  auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
102  auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
103  auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
104 
105  // Check that the extents for all dimensions are correct.
106  for(typename TDim::value_type i(0); i < TDim::value; ++i)
107  {
108  // Check that the maximum extents are greater or equal 1.
109  if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
110  {
111  return false;
112  }
113  }
114 
115  return true;
116  }
117 
118  //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
119  //! 1. The the maxima block, thread and element extent and counts
120  //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
121  //! 3. The requirement of the block extent.
122  //!
123  //! \param gridElemExtent The full extent of elements in the grid.
124  //! \param threadElemExtent the number of elements computed per thread.
125  //! \param accDevProps The maxima for the work division.
126  //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
127  //! used, device hard limits are used.
128  //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
129  //! corresponding block thread extent.
130  //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
131  //! thread extent will be one in this dimension.
132  //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
133  template<typename TDim, typename TIdx>
135  Vec<TDim, TIdx> const& gridElemExtent,
136  Vec<TDim, TIdx> const& threadElemExtent,
137  AccDevProps<TDim, TIdx> const& accDevProps,
138  TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
139  bool blockThreadMustDivideGridThreadExtent = true,
140  GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
142  {
143  using Vec = Vec<TDim, TIdx>;
144  using DimLoopInd = typename TDim::value_type;
145 
146  for(DimLoopInd i(0); i < TDim::value; ++i)
147  {
148  ALPAKA_ASSERT(gridElemExtent[i] >= 1);
149  ALPAKA_ASSERT(threadElemExtent[i] >= 1);
150  ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
151  }
152  ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
153  ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
154 
155  // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
156  // optimized.
157  auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
158  auto const gridThreadExtent = [&]
159  {
160  Vec r;
161  for(DimLoopInd i(0u); i < TDim::value; ++i)
162  r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
163  return r;
164  }();
165 
166  ///////////////////////////////////////////////////////////////////
167  // Try to calculate an optimal blockThreadExtent.
168 
169  // Restrict the max block thread extent from the maximum possible to the grid thread extent.
170  // This removes dimensions not required in the grid thread extent.
171  // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
172  auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
173 
174  // For equal block thread extent, restrict it to its minimum component.
175  // For example (512, 256, 1024) will get (256, 256, 256).
176  if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
177  blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
178 
179  // Choose kernelBlockThreadCountMax if it is not zero. It is less than the accelerator properties.
180  TIdx const& blockThreadCountMax
181  = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
182 
183  // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
184  // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
185  // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
186  // enough.
187  for(typename TDim::value_type i(0); i < TDim::value; ++i)
188  {
189  blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
190  }
191 
192  // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
193  if(blockThreadCountMax == 1)
194  {
195  blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
196  }
197  else if(blockThreadExtent.prod() > blockThreadCountMax)
198  {
199  switch(gridBlockExtentSubDivRestrictions)
200  {
202  blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
203  break;
205  // Very primitive clipping. Just halve the largest value until it fits.
206  while(blockThreadExtent.prod() > blockThreadCountMax)
207  blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
208  break;
210  // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
211  while(blockThreadExtent.prod() > blockThreadCountMax)
212  {
213  auto const it = std::min_element(
214  blockThreadExtent.begin(),
215  blockThreadExtent.end() - 1, //! \todo why omit the last element?
216  [](TIdx const& a, TIdx const& b)
217  {
218  if(a == TIdx{1})
219  return false;
220  if(b == TIdx{1})
221  return true;
222  return a < b;
223  });
224  *it /= TIdx{2};
225  }
226  break;
227  }
228  }
229 
230 
231  // Make the block thread extent divide the grid thread extent.
232  if(blockThreadMustDivideGridThreadExtent)
233  {
234  switch(gridBlockExtentSubDivRestrictions)
235  {
237  {
238  // For equal size block extent we have to compute the gcd of all grid thread extent that is less
239  // then the current maximal block thread extent. For this we compute the divisors of all grid
240  // thread extent less then the current maximal block thread extent.
241  std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
242  for(DimLoopInd i(0u); i < TDim::value; ++i)
243  {
244  gridThreadExtentDivisors[i]
245  = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
246  }
247  // The maximal common divisor of all block thread extent is the optimal solution.
248  std::set<TIdx> intersects[2u];
249  for(DimLoopInd i(1u); i < TDim::value; ++i)
250  {
251  intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
252  intersects[(i) % 2u].clear();
253  set_intersection(
254  std::begin(intersects[(i - 1u) % 2u]),
255  std::end(intersects[(i - 1u) % 2u]),
256  std::begin(gridThreadExtentDivisors[i]),
257  std::end(gridThreadExtentDivisors[i]),
258  std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
259  }
260  TIdx const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
261  blockThreadExtent = Vec::all(maxCommonDivisor);
262  break;
263  }
265  [[fallthrough]];
267  for(DimLoopInd i(0u); i < TDim::value; ++i)
268  {
269  blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
270  }
271  break;
272  }
273  }
274 
275  // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
276  auto gridBlockExtent = [&]
277  {
278  Vec r;
279  for(DimLoopInd i = 0; i < TDim::value; ++i)
280  r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
281  return r;
282  }();
283 
284 
285  // Store the maxima allowed for extents of grid, blocks and threads.
286  auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
287  auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
288  auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
289 
290  // Check that the extents for all dimensions are correct.
291  for(typename TDim::value_type i(0); i < TDim::value; ++i)
292  {
293  // Check that the maximum extents are greater or equal 1.
294  if(gridBlockExtentMax[i] < gridBlockExtent[i])
295  {
296  gridBlockExtent[i] = gridBlockExtentMax[i];
297  }
298  if(blockThreadExtentMax[i] < blockThreadExtent[i])
299  {
300  blockThreadExtent[i] = blockThreadExtentMax[i];
301  }
302  if(threadElemExtentMax[i] < threadElemExtent[i])
303  {
304  clippedThreadElemExtent[i] = threadElemExtentMax[i];
305  }
306  }
307 
308  return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
309  }
310 
311  //! Kernel start configuration to determine a valid work division
312  //!
313  //! \tparam TGridElemExtent The type of the grid element extent.
314  //! \tparam TThreadElemExtent The type of the thread element extent.
315  template<
316  typename TAcc,
317  typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
318  typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
319  struct KernelCfg
320  {
321  //! The full extent of elements in the grid.
322  TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
323  //! The number of elements computed per thread.
324  TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
325  //! If this is true, the grid thread extent will be multiples of
326  //! the corresponding block thread extent.
327  //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
328  //! thread extent will be one in this dimension.
329  bool blockThreadMustDivideGridThreadExtent = true;
330  //! The grid block extent subdivision restrictions.
331  GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
332  = GridBlockExtentSubDivRestrictions::Unrestricted;
333 
334  static_assert(
336  "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
337  static_assert(
339  "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
340  static_assert(
341  std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
342  "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
343  static_assert(
344  std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
345  "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
346  };
347 
348  //! \tparam TDev The type of the device.
349  //! \tparam TGridElemExtent The type of the grid element extent.
350  //! \tparam TThreadElemExtent The type of the thread element extent.
351  //! \param dev The device the work division should be valid for.
352  //! \param kernelFnObj The kernel function object which should be executed.
353  //! \param args The kernel invocation arguments.
354  //! \return The work division for the accelerator based on the kernel and argument types
355  template<
356  typename TAcc,
357  typename TDev,
358  typename TGridElemExtent,
359  typename TThreadElemExtent,
360  typename TKernelFnObj,
361  typename... TArgs>
364  [[maybe_unused]] TDev const& dev,
365  TKernelFnObj const& kernelFnObj,
366  TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
367  {
368  using Acc = TAcc;
369 
370  // Get max number of threads per block depending on the kernel function attributes.
371  // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
372  // determines the max number of threads per block. This number could be equal or less than the max number of
373  // threads per block defined by device properties.
374  auto const kernelFunctionAttributes
375  = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
376  auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
377 
378  if constexpr(Dim<TGridElemExtent>::value == 0)
379  {
380  auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
381  ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
382  ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
383  return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
384  }
385  else
386  return subDivideGridElems(
387  getExtents(kernelCfg.gridElemExtent),
388  getExtents(kernelCfg.threadElemExtent),
389  getAccDevProps<Acc>(dev),
390  static_cast<Idx<Acc>>(threadsPerBlock),
391  kernelCfg.blockThreadMustDivideGridThreadExtent,
392  kernelCfg.gridBlockExtentSubDivRestrictions);
393 
394  using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
396  }
397 
398  //! Checks if the work division is supported
399  //!
400  //! \tparam TWorkDiv The type of the work division.
401  //! \tparam TDim The dimensionality of the accelerator device properties.
402  //! \tparam TIdx The idx type of the accelerator device properties.
403  //! \param workDiv The work division to test for validity.
404  //! \param accDevProps The maxima for the work division.
405  //! \return If the work division is valid for the given accelerator device properties.
406  template<typename TWorkDiv, typename TDim, typename TIdx>
407  ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
408  {
409  // Get the extents of grid, blocks and threads of the work division to check.
410  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
411  auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
412  auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
413 
414  // Check that the maximal counts are satisfied.
415  if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
416  {
417  return false;
418  }
419  if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
420  {
421  return false;
422  }
423  if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
424  {
425  return false;
426  }
427 
428  // Check that the extents for all dimensions are correct.
429  if constexpr(Dim<TWorkDiv>::value > 0)
430  {
431  // Store the maxima allowed for extents of grid, blocks and threads.
432  auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
433  auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
434  auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
435 
436  for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
437  {
438  // No extent is allowed to be zero or greater then the allowed maximum.
439  if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
440  || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
441  || (threadElemExtentMax[i] < threadElemExtent[i]))
442  {
443  return false;
444  }
445  }
446  }
447 
448  return true;
449  }
450 
451  //! Checks if the work division is supported
452  //!
453  //! \tparam TWorkDiv The type of the work division.
454  //! \tparam TDim The dimensionality of the accelerator device properties.
455  //! \tparam TIdx The idx type of the accelerator device properties.
456  //! \param workDiv The work division to test for validity.
457  //! \param accDevProps The maxima for the work division.
458  //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
459  //! be used by this kernel on the given device. This number can be equal to or smaller than the the number of
460  //! threads per block supported by the device.
461  //! \return Returns true if the work division is valid for the given accelerator device properties and for the
462  //! given kernel. Otherwise returns false.
463  template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
465  TWorkDiv const& workDiv,
466  AccDevProps<TDim, TIdx> const& accDevProps,
467  KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
468  {
469  // Get the extents of grid, blocks and threads of the work division to check.
470  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
471  auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
472  auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
473  // Use kernel properties to find the max threads per block for the kernel
474  auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
475  // Select the minimum to find the upper bound for the threads per block
476  auto const allowedThreadsPerBlock = std::min(
477  static_cast<TIdx>(threadsPerBlockForKernel),
478  static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
479  // Check that the maximal counts are satisfied.
480  if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
481  {
482  return false;
483  }
484  if(allowedThreadsPerBlock < blockThreadExtent.prod())
485  {
486  return false;
487  }
488  if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
489  {
490  return false;
491  }
492 
493  // Check that the extents for all dimensions are correct.
494  if constexpr(Dim<TWorkDiv>::value > 0)
495  {
496  // Store the maxima allowed for extents of grid, blocks and threads.
497  auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
498  auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
499  auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
500 
501  for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
502  {
503  // No extent is allowed to be zero or greater then the allowed maximum.
504  if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
505  || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
506  || (threadElemExtentMax[i] < threadElemExtent[i]))
507  {
508  return false;
509  }
510  }
511  }
512 
513  return true;
514  }
515 
516  //! Checks if the work division is supported for the kernel on the device
517  //!
518  //! \tparam TAcc The accelerator to test the validity on.
519  //! \tparam TDev The type of the device.
520  //! \tparam TWorkDiv The type of work division to test for validity.
521  //! \param workDiv The work division to test for validity.
522  //! \param dev The device to test the work division for validity on.
523  //! \param kernelFnObj The kernel function object which should be executed.
524  //! \param args The kernel invocation arguments.
525  //! \return Returns the value of isValidWorkDiv function.
526  template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
528  TWorkDiv const& workDiv,
529  TDev const& dev,
530  TKernelFnObj const& kernelFnObj,
531  TArgs&&... args) -> bool
532  {
533  return isValidWorkDiv<TAcc>(
534  workDiv,
535  getAccDevProps<TAcc>(dev),
536  getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
537  }
538 
539  //! Checks if the work division is supported by the device
540  //!
541  //! \tparam TAcc The accelerator to test the validity on.
542  //! \param workDiv The work division to test for validity.
543  //! \param dev The device to test the work division for validity on.
544  //! \return If the work division is valid on this accelerator.
545  template<typename TAcc, typename TWorkDiv, typename TDev>
546  ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
547  {
548  return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
549  }
550 } // namespace alpaka
551 
552 #if BOOST_COMP_CLANG
553 # pragma clang diagnostic pop
554 #endif
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition: Assert.hpp:13
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
Definition: Unreachable.hpp:24
ALPAKA_NO_HOST_ACC_WARNING static constexpr ALPAKA_FN_HOST_ACC auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
Definition: Vec.hpp:116
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
constexpr ALPAKA_FN_HOST_ACC auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition: Utility.hpp:27
ALPAKA_NO_HOST_ACC_WARNING constexpr ALPAKA_FN_HOST_ACC auto assertValueUnsigned(TArg const &arg) -> void
This method checks integral values if they are greater or equal zero. The implementation prevents war...
Definition: Assert.hpp:77
constexpr ALPAKA_FN_HOST_ACC auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
Definition: Utility.hpp:46
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where divident % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto min(T const &min_ctx, Tx const &x, Ty const &y)
Returns the smaller of two arguments. NaNs are treated as missing data (between a NaN and a numeric v...
Definition: Traits.hpp:1280
ALPAKA_FN_HOST auto end(TView &view) -> Iterator< TView >
Definition: Iterator.hpp:139
ALPAKA_FN_HOST auto begin(TView &view) -> Iterator< TView >
Definition: Iterator.hpp:133
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition: Traits.hpp:29
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, TDev const &dev) -> bool
Checks if the work division is supported by the device.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T >>
Definition: Traits.hpp:59
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, [[maybe_unused]] TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc >>
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING constexpr ALPAKA_FN_HOST_ACC auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
Definition: Vec.hpp:634
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex >>
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
Definition: Traits.hpp:78
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
Definition: AccDevProps.hpp:18
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Definition: AccDevProps.hpp:26
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
Definition: AccDevProps.hpp:25
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Definition: AccDevProps.hpp:30
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Definition: AccDevProps.hpp:27
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
Definition: AccDevProps.hpp:29
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Definition: AccDevProps.hpp:28
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...