alpaka
Abstraction Library for Parallel Kernel Acceleration
WorkDivHelpers.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 #include "alpaka/acc/Traits.hpp"
8 #include "alpaka/core/Assert.hpp"
9 #include "alpaka/core/Common.hpp"
10 #include "alpaka/core/Utility.hpp"
11 #include "alpaka/dev/Traits.hpp"
12 #include "alpaka/extent/Traits.hpp"
13 #include "alpaka/vec/Vec.hpp"
15 
16 #include <algorithm>
17 #include <array>
18 #include <cmath>
19 #include <functional>
20 #include <set>
21 #include <type_traits>
22 
23 //! The alpaka library.
24 namespace alpaka
25 {
26  //! The grid block extent subdivision restrictions.
28  {
29  EqualExtent, //!< The block thread extent will be equal in all dimensions.
30  CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
31  Unrestricted, //!< The block thread extent will not have any restrictions.
32  };
33 
34  namespace detail
35  {
36  //! Finds the largest divisor where divident % divisor == 0
37  //! \param dividend The dividend.
38  //! \param maxDivisor The maximum divisor.
39  //! \return The biggest number that satisfies the following conditions:
40  //! 1) dividend%ret==0
41  //! 2) ret<=maxDivisor
42  template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
43  ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
44  {
45  core::assertValueUnsigned(dividend);
46  core::assertValueUnsigned(maxDivisor);
47  ALPAKA_ASSERT(dividend >= maxDivisor);
48 
49  T divisor = maxDivisor;
50  while(dividend % divisor != 0)
51  --divisor;
52  return divisor;
53  }
54 
55  //! \param val The value to find divisors of.
56  //! \param maxDivisor The maximum.
57  //! \return A list of all divisors less then or equal to the given maximum.
58  template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
59  ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
60  {
61  std::set<T> divisorSet;
62 
64  core::assertValueUnsigned(maxDivisor);
65  ALPAKA_ASSERT(maxDivisor <= val);
66 
67  for(T i(1); i <= std::min(val, maxDivisor); ++i)
68  {
69  if(val % i == 0)
70  {
71  divisorSet.insert(static_cast<T>(val / i));
72  }
73  }
74 
75  return divisorSet;
76  }
77  } // namespace detail
78 
79  //! \tparam TDim The dimensionality of the accelerator device properties.
80  //! \tparam TIdx The idx type of the accelerator device properties.
81  //! \param accDevProps The maxima for the work division.
82  //! \return If the accelerator device properties are valid.
83  template<typename TDim, typename TIdx>
85  {
86  // Check that the maximum counts are greater or equal 1.
87  if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
88  || (accDevProps.m_threadElemCountMax < 1))
89  {
90  return false;
91  }
92 
93  // Store the maxima allowed for extents of grid, blocks and threads.
94  auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
95  auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
96  auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
97 
98  // Check that the extents for all dimensions are correct.
99  for(typename TDim::value_type i(0); i < TDim::value; ++i)
100  {
101  // Check that the maximum extents are greater or equal 1.
102  if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
103  {
104  return false;
105  }
106  }
107 
108  return true;
109  }
110 
111  //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
112  //! 1. The the maxima block, thread and element extent and counts
113  //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
114  //! 3. The requirement of the block extent.
115  //!
116  //! \param gridElemExtent
117  //! The full extent of elements in the grid.
118  //! \param threadElemExtent
119  //! the number of elements computed per thread.
120  //! \param accDevProps
121  //! The maxima for the work division.
122  //! \param blockThreadMustDivideGridThreadExtent
123  //! If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
124  //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
125  //! thread extent will be one in this dimension.
126  //! \param gridBlockExtentSubDivRestrictions
127  //! The grid block extent subdivision restrictions.
128  template<typename TDim, typename TIdx>
130  Vec<TDim, TIdx> const& gridElemExtent,
131  Vec<TDim, TIdx> const& threadElemExtent,
132  AccDevProps<TDim, TIdx> const& accDevProps,
133  bool blockThreadMustDivideGridThreadExtent = true,
134  GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
136  {
137  using Vec = Vec<TDim, TIdx>;
138  using DimLoopInd = typename TDim::value_type;
139 
140  for(DimLoopInd i(0); i < TDim::value; ++i)
141  {
142  ALPAKA_ASSERT(gridElemExtent[i] >= 1);
143  ALPAKA_ASSERT(threadElemExtent[i] >= 1);
144  ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
145  }
146  ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
147  ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
148 
149  // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
150  // optimized.
151  auto const clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
152  auto const gridThreadExtent = [&]
153  {
154  Vec r;
155  for(DimLoopInd i(0u); i < TDim::value; ++i)
156  r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
157  return r;
158  }();
159 
160  ///////////////////////////////////////////////////////////////////
161  // Try to calculate an optimal blockThreadExtent.
162 
163  // Restrict the max block thread extent from the maximum possible to the grid thread extent.
164  // This removes dimensions not required in the grid thread extent.
165  // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
166  auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
167 
168  // For equal block thread extent, restrict it to its minimum component.
169  // For example (512, 256, 1024) will get (256, 256, 256).
170  if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
171  blockThreadExtent = Vec::all(blockThreadExtent.min());
172 
173  // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
174  auto const& blockThreadCountMax = accDevProps.m_blockThreadCountMax;
175  if(blockThreadExtent.prod() > blockThreadCountMax)
176  {
177  switch(gridBlockExtentSubDivRestrictions)
178  {
180  blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
181  break;
183  // Very primitive clipping. Just halve the largest value until it fits.
184  while(blockThreadExtent.prod() > blockThreadCountMax)
185  blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
186  break;
188  // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
189  while(blockThreadExtent.prod() > blockThreadCountMax)
190  {
191  auto const it = std::min_element(
192  blockThreadExtent.begin(),
193  blockThreadExtent.end() - 1, //! \todo why omit the last element?
194  [](TIdx const& a, TIdx const& b)
195  {
196  if(a == TIdx{1})
197  return false;
198  if(b == TIdx{1})
199  return true;
200  return a < b;
201  });
202  *it /= TIdx{2};
203  }
204  break;
205  }
206  }
207 
208  // Make the block thread extent divide the grid thread extent.
209  if(blockThreadMustDivideGridThreadExtent)
210  {
211  switch(gridBlockExtentSubDivRestrictions)
212  {
214  {
215  // For equal size block extent we have to compute the gcd of all grid thread extent that is less
216  // then the current maximal block thread extent. For this we compute the divisors of all grid
217  // thread extent less then the current maximal block thread extent.
218  std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
219  for(DimLoopInd i(0u); i < TDim::value; ++i)
220  {
221  gridThreadExtentDivisors[i]
222  = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
223  }
224  // The maximal common divisor of all block thread extent is the optimal solution.
225  std::set<TIdx> intersects[2u];
226  for(DimLoopInd i(1u); i < TDim::value; ++i)
227  {
228  intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
229  intersects[(i) % 2u].clear();
230  set_intersection(
231  std::begin(intersects[(i - 1u) % 2u]),
232  std::end(intersects[(i - 1u) % 2u]),
233  std::begin(gridThreadExtentDivisors[i]),
234  std::end(gridThreadExtentDivisors[i]),
235  std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
236  }
237  TIdx const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
238  blockThreadExtent = Vec::all(maxCommonDivisor);
239  break;
240  }
242  [[fallthrough]];
244  for(DimLoopInd i(0u); i < TDim::value; ++i)
245  blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
246  break;
247  }
248  }
249 
250  // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
251  auto const gridBlockExtent = [&]
252  {
253  Vec r;
254  for(DimLoopInd i = 0; i < TDim::value; ++i)
255  r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
256  return r;
257  }();
258 
259  return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
260  }
261 
262  //! \tparam TAcc The accelerator for which this work division has to be valid.
263  //! \tparam TGridElemExtent The type of the grid element extent.
264  //! \tparam TThreadElemExtent The type of the thread element extent.
265  //! \tparam TDev The type of the device.
266  //! \param dev
267  //! The device the work division should be valid for.
268  //! \param gridElemExtent
269  //! The full extent of elements in the grid.
270  //! \param threadElemExtents
271  //! the number of elements computed per thread.
272  //! \param blockThreadMustDivideGridThreadExtent
273  //! If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
274  //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
275  //! thread extent will be one in this dimension.
276  //! \param gridBlockExtentSubDivRestrictions
277  //! The grid block extent subdivision restrictions.
278  //! \return The work division.
279  template<
280  typename TAcc,
281  typename TDev,
282  typename TGridElemExtent = Vec<Dim<TAcc>, Idx<TAcc>>,
283  typename TThreadElemExtent = Vec<Dim<TAcc>, Idx<TAcc>>>
285  [[maybe_unused]] TDev const& dev,
286  [[maybe_unused]] TGridElemExtent const& gridElemExtent = Vec<Dim<TAcc>, Idx<TAcc>>::ones(),
287  [[maybe_unused]] TThreadElemExtent const& threadElemExtents = Vec<Dim<TAcc>, Idx<TAcc>>::ones(),
288  [[maybe_unused]] bool blockThreadMustDivideGridThreadExtent = true,
289  [[maybe_unused]] GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
290  = GridBlockExtentSubDivRestrictions::Unrestricted)
292  {
293  static_assert(
295  "The dimension of TAcc and the dimension of TGridElemExtent have to be identical!");
296  static_assert(
298  "The dimension of TAcc and the dimension of TThreadElemExtent have to be identical!");
299  static_assert(
300  std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
301  "The idx type of TAcc and the idx type of TGridElemExtent have to be identical!");
302  static_assert(
303  std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
304  "The idx type of TAcc and the idx type of TThreadElemExtent have to be identical!");
305 
306  if constexpr(Dim<TGridElemExtent>::value == 0)
307  {
308  auto const zero = Vec<DimInt<0>, Idx<TAcc>>{};
309  ALPAKA_ASSERT(gridElemExtent == zero);
310  ALPAKA_ASSERT(threadElemExtents == zero);
311  return WorkDivMembers<DimInt<0>, Idx<TAcc>>{zero, zero, zero};
312  }
313  else
314  return subDivideGridElems(
315  getExtents(gridElemExtent),
316  getExtents(threadElemExtents),
317  getAccDevProps<TAcc>(dev),
318  blockThreadMustDivideGridThreadExtent,
319  gridBlockExtentSubDivRestrictions);
320  using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
322  }
323 
324  //! \tparam TDim The dimensionality of the accelerator device properties.
325  //! \tparam TIdx The idx type of the accelerator device properties.
326  //! \tparam TWorkDiv The type of the work division.
327  //! \param accDevProps The maxima for the work division.
328  //! \param workDiv The work division to test for validity.
329  //! \return If the work division is valid for the given accelerator device properties.
330  template<typename TDim, typename TIdx, typename TWorkDiv>
331  ALPAKA_FN_HOST auto isValidWorkDiv(AccDevProps<TDim, TIdx> const& accDevProps, TWorkDiv const& workDiv) -> bool
332  {
333  // Get the extents of grid, blocks and threads of the work division to check.
334  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
335  auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
336  auto const threadElemExtent = getWorkDiv<Block, Threads>(workDiv);
337 
338  // Check that the maximal counts are satisfied.
339  if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
340  {
341  return false;
342  }
343  if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
344  {
345  return false;
346  }
347  if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
348  {
349  return false;
350  }
351 
352  // Check that the extents for all dimensions are correct.
353  if constexpr(Dim<TWorkDiv>::value > 0)
354  {
355  // Store the maxima allowed for extents of grid, blocks and threads.
356  auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
357  auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
358  auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
359 
360  for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
361  {
362  // No extent is allowed to be zero or greater then the allowed maximum.
363  if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
364  || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
365  || (threadElemExtentMax[i] < threadElemExtent[i]))
366  {
367  return false;
368  }
369  }
370  }
371 
372  return true;
373  }
374 
375  //! \tparam TAcc The accelerator to test the validity on.
376  //! \param dev The device to test the work division for validity on.
377  //! \param workDiv The work division to test for validity.
378  //! \return If the work division is valid on this accelerator.
379  template<typename TAcc, typename TDev, typename TWorkDiv>
380  ALPAKA_FN_HOST auto isValidWorkDiv(TDev const& dev, TWorkDiv const& workDiv) -> bool
381  {
382  return isValidWorkDiv(getAccDevProps<TAcc>(dev), workDiv);
383  }
384 } // namespace alpaka
#define ALPAKA_ASSERT(...)
The assert can be explicit disabled by defining NDEBUG.
Definition: Assert.hpp:13
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
Definition: Unreachable.hpp:24
ALPAKA_NO_HOST_ACC_WARNING static constexpr ALPAKA_FN_HOST_ACC auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
Definition: Vec.hpp:116
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
constexpr ALPAKA_FN_HOST_ACC auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition: Utility.hpp:27
constexpr ALPAKA_FN_HOST_ACC auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
Definition: Utility.hpp:46
ALPAKA_NO_HOST_ACC_WARNING constexpr ALPAKA_FN_HOST_ACC auto assertValueUnsigned(TArg const &arg) -> void
This method checks integral values if they are greater or equal zero. The implementation prevents war...
Definition: Assert.hpp:77
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where divident % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto min(T const &min_ctx, Tx const &x, Ty const &y)
Returns the smaller of two arguments. NaNs are treated as missing data (between a NaN and a numeric v...
Definition: Traits.hpp:1280
ALPAKA_FN_HOST auto end(TView &view) -> Iterator< TView >
Definition: Iterator.hpp:139
ALPAKA_FN_HOST auto begin(TView &view) -> Iterator< TView >
Definition: Iterator.hpp:133
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition: Traits.hpp:29
ALPAKA_FN_HOST auto getValidWorkDiv([[maybe_unused]] TDev const &dev, [[maybe_unused]] TGridElemExtent const &gridElemExtent=Vec< Dim< TAcc >, Idx< TAcc >>::ones(), [[maybe_unused]] TThreadElemExtent const &threadElemExtents=Vec< Dim< TAcc >, Idx< TAcc >>::ones(), [[maybe_unused]] bool blockThreadMustDivideGridThreadExtent=true, [[maybe_unused]] GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< Dim< TGridElemExtent >, Idx< TGridElemExtent >>
ALPAKA_FN_HOST auto isValidWorkDiv(TDev const &dev, TWorkDiv const &workDiv) -> bool
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T >>
Definition: Traits.hpp:59
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING constexpr ALPAKA_FN_HOST_ACC auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
Definition: Vec.hpp:627
Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex >>
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
The acceleration properties on a device.
Definition: AccDevProps.hpp:18