alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WorkDivHelpers.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/dev/Traits.hpp"
15#include "alpaka/vec/Vec.hpp"
17
18#include <algorithm>
19#include <array>
20#include <cmath>
21#include <functional>
22#include <iterator>
23#include <set>
24#include <type_traits>
25
26#if ALPAKA_COMP_CLANG
27# pragma clang diagnostic push
28# pragma clang diagnostic ignored "-Wswitch-default"
29#endif
30
31//! The alpaka library.
32namespace alpaka
33{
34 //! The grid block extent subdivision restrictions: the constraints placed on the block thread extent that is
//! chosen when a grid is subdivided into blocks.
36 {
37 EqualExtent, //!< The block thread extent will be equal in all dimensions.
38 CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
39 Unrestricted, //!< The block thread extent will not have any restrictions.
40 };
41
42 namespace detail
43 {
44 //! Finds the largest divisor of the dividend (dividend % divisor == 0) that does not exceed a given maximum.
45 //! \param dividend The dividend.
46 //! \param maxDivisor The maximum divisor.
47 //! \return The biggest number that satisfies the following conditions:
48 //! 1) dividend%ret==0
49 //! 2) ret<=maxDivisor
//! \note NOTE(review): maxDivisor == 0 passes both assertions below (for a dividend >= 0) but makes the first
//! `dividend % divisor` a modulo by zero - callers must pass maxDivisor >= 1.
50 template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
51 ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
52 {
54 core::assertValueUnsigned(maxDivisor);
55 ALPAKA_ASSERT(dividend >= maxDivisor);
56
57 T divisor = maxDivisor;
// Count down from the upper bound: the first value that divides the dividend evenly is the largest one.
// For maxDivisor >= 1 the loop stops at 1 at the latest, because every value is divisible by 1.
58 while(dividend % divisor != 0)
59 --divisor;
60 return divisor;
61 }
62
63 //! \param val The value to find divisors of.
64 //! \param maxDivisor The maximum.
65 //! \return A list of all divisors less then or equal to the given maximum.
66 template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
67 ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
68 {
69 std::set<T> divisorSet;
70
72 core::assertValueUnsigned(maxDivisor);
73 ALPAKA_ASSERT(maxDivisor <= val);
74
75 for(T i(1); i <= std::min(val, maxDivisor); ++i)
76 {
77 if(val % i == 0)
78 {
79 divisorSet.insert(static_cast<T>(val / i));
80 }
81 }
82
83 return divisorSet;
84 }
85 } // namespace detail
86
//! Checks whether the accelerator device properties describe usable limits, i.e. whether every maximum count and
//! every per-dimension maximum extent is at least 1.
87 //! \tparam TDim The dimensionality of the accelerator device properties.
88 //! \tparam TIdx The idx type of the accelerator device properties.
89 //! \param accDevProps The maxima for the work division.
90 //! \return True if the accelerator device properties are valid, false otherwise.
91 template<typename TDim, typename TIdx>
93 {
94 // Check that the maximum counts are greater than or equal to 1.
95 if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
96 || (accDevProps.m_threadElemCountMax < 1))
97 {
98 return false;
99 }
100
101 // Store the maxima allowed for extents of grid, blocks and threads.
102 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
103 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
104 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
105
106 // Check that the extents for all dimensions are correct.
107 for(typename TDim::value_type i(0); i < TDim::value; ++i)
108 {
109 // Check that the maximum extents are greater than or equal to 1.
110 if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
111 {
112 return false;
113 }
114 }
115
116 return true;
117 }
118
119 //! Subdivides the given grid thread extent into blocks, restricted by:
120 //! 1. The maximum block, thread and element extents and counts
121 //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
122 //! 3. The requirement of the block extent.
123 //!
124 //! \param gridElemExtent The full extent of elements in the grid.
125 //! \param threadElemExtent The number of elements computed per thread.
126 //! \param accDevProps The maxima for the work division.
127 //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
128 //! used, device hard limits are used.
129 //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
130 //! corresponding block thread extent.
131 //! NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the block
132 //! thread extent will be one in this dimension.
133 //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
//! \return The work division built from the computed grid block extent, the block thread extent and the clipped
//! thread element extent.
134 template<typename TDim, typename TIdx>
136 Vec<TDim, TIdx> const& gridElemExtent,
137 Vec<TDim, TIdx> const& threadElemExtent,
138 AccDevProps<TDim, TIdx> const& accDevProps,
139 TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
140 bool blockThreadMustDivideGridThreadExtent = true,
141 GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
143 {
144 using Vec = Vec<TDim, TIdx>;
145 using DimLoopInd = typename TDim::value_type;
146
// Preconditions: every extent is at least 1 and the requested elements per thread respect the device maxima.
147 for(DimLoopInd i(0); i < TDim::value; ++i)
148 {
149 ALPAKA_ASSERT(gridElemExtent[i] >= 1);
150 ALPAKA_ASSERT(threadElemExtent[i] >= 1);
151 ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
152 }
153 ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
154 ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
155
156 // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
157 // optimized.
158 auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
159 auto const gridThreadExtent = [&]
160 {
161 Vec r;
162 for(DimLoopInd i(0u); i < TDim::value; ++i)
163 r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
164 return r;
165 }();
166
167 ///////////////////////////////////////////////////////////////////
168 // Try to calculate an optimal blockThreadExtent.
169
170 // Restrict the max block thread extent from the maximum possible to the grid thread extent.
171 // This removes dimensions not required in the grid thread extent.
172 // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
173 auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
174
175 // For equal block thread extent, restrict it to its minimum component.
176 // For example (512, 256, 1024) will get (256, 256, 256).
177 if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
178 blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
179
180 // Use kernelBlockThreadCountMax if it is not zero, otherwise fall back to the device limit.
181 TIdx const& blockThreadCountMax
182 = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
183
184 // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
185 // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
186 // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
187 // enough.
188 for(typename TDim::value_type i(0); i < TDim::value; ++i)
189 {
190 blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
191 }
192
193 // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
194 if(blockThreadCountMax == 1)
195 {
196 blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
197 }
198 else if(blockThreadExtent.prod() > blockThreadCountMax)
199 {
200 switch(gridBlockExtentSubDivRestrictions)
201 {
203 blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
204 break;
206 // Very primitive clipping. Just halve the largest value until it fits.
207 while(blockThreadExtent.prod() > blockThreadCountMax)
208 blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
209 break;
211 // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
// NOTE(review): the comparator orders 1 after every other value, so when all considered elements are already 1
// the element picked is a 1 and halving it yields 0 - confirm this cannot happen for valid inputs.
212 while(blockThreadExtent.prod() > blockThreadCountMax)
213 {
214 auto const it = std::min_element(
215 blockThreadExtent.begin(),
216 blockThreadExtent.end() - 1, //! \todo why omit the last element?
217 [](TIdx const& a, TIdx const& b)
218 {
219 if(a == TIdx{1})
220 return false;
221 if(b == TIdx{1})
222 return true;
223 return a < b;
224 });
225 *it /= TIdx{2};
226 }
227 break;
228 }
229 }
230
231
232 // Make the block thread extent divide the grid thread extent.
233 if(blockThreadMustDivideGridThreadExtent)
234 {
235 switch(gridBlockExtentSubDivRestrictions)
236 {
238 {
239 // For equal size block extent we have to compute the gcd of all grid thread extents that is less
240 // than the current maximal block thread extent. For this we compute the divisors of all grid
241 // thread extents less than the current maximal block thread extent.
242 std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
243 for(DimLoopInd i(0u); i < TDim::value; ++i)
244 {
245 gridThreadExtentDivisors[i]
246 = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
247 }
248 // The maximal common divisor of all block thread extents is the optimal solution.
// The two sets are used alternately: one holds the running intersection, the other receives the new result.
// NOTE(review): the first statement in the loop re-seeds the running intersection with the divisors of
// dimension 0 on every iteration, so the final set only intersects dimension 0 with the last dimension -
// confirm whether the intermediate dimensions are meant to be included.
249 std::set<TIdx> intersects[2u];
250 for(DimLoopInd i(1u); i < TDim::value; ++i)
251 {
252 intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
253 intersects[(i) % 2u].clear();
254 set_intersection(
255 std::begin(intersects[(i - 1u) % 2u]),
256 std::end(intersects[(i - 1u) % 2u]),
257 std::begin(gridThreadExtentDivisors[i]),
258 std::end(gridThreadExtentDivisors[i]),
259 std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
260 }
// The sets are ordered, so the element before end() is the largest common divisor.
// NOTE(review): this requires a non-empty set. For TDim::value == 1 the loop above never runs and the set is
// still empty - confirm that this branch cannot be reached in that case.
261 TIdx const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
262 blockThreadExtent = Vec::all(maxCommonDivisor);
263 break;
264 }
266 [[fallthrough]];
// Per dimension, shrink the block thread extent to the largest divisor of the grid thread extent.
268 for(DimLoopInd i(0u); i < TDim::value; ++i)
269 {
270 blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
271 }
272 break;
273 }
274 }
275
276 // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
277 auto gridBlockExtent = [&]
278 {
279 Vec r;
280 for(DimLoopInd i = 0; i < TDim::value; ++i)
281 r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
282 return r;
283 }();
284
285
286 // Store the maxima allowed for extents of grid, blocks and threads.
287 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
288 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
289 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
290
291 // Clip the extents of all dimensions to the device maxima.
292 for(typename TDim::value_type i(0); i < TDim::value; ++i)
293 {
294 // Replace every extent that exceeds its maximum by that maximum.
295 if(gridBlockExtentMax[i] < gridBlockExtent[i])
296 {
297 gridBlockExtent[i] = gridBlockExtentMax[i];
298 }
299 if(blockThreadExtentMax[i] < blockThreadExtent[i])
300 {
301 blockThreadExtent[i] = blockThreadExtentMax[i];
302 }
303 if(threadElemExtentMax[i] < threadElemExtent[i])
304 {
305 clippedThreadElemExtent[i] = threadElemExtentMax[i];
306 }
307 }
308
309 return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
310 }
311
312 //! Kernel start configuration to determine a valid work division
313 //!
//! \tparam TAcc The accelerator type. It supplies the default extent types, and the static assertions below
//! compare the extent types against it.
314 //! \tparam TGridElemExtent The type of the grid element extent.
315 //! \tparam TThreadElemExtent The type of the thread element extent.
316 template<
317 typename TAcc,
318 typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
319 typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
321 {
322 //! The full extent of elements in the grid. Defaults to one element in every dimension.
323 TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
324 //! The number of elements computed per thread. Defaults to one element in every dimension.
325 TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
326 //! If this is true, the grid thread extent will be multiples of
327 //! the corresponding block thread extent.
328 //! NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the block
329 //! thread extent will be one in this dimension.
330 bool blockThreadMustDivideGridThreadExtent = true;
331 //! The grid block extent subdivision restrictions.
332 GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
333 = GridBlockExtentSubDivRestrictions::Unrestricted;
334
// Compile-time checks: the extent types must agree with the accelerator in dimension and idx type.
335 static_assert(
337 "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
338 static_assert(
340 "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
341 static_assert(
342 std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
343 "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
344 static_assert(
345 std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
346 "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
347 };
348
//! Computes a work division for a kernel on a device from the kernel start configuration.
//!
//! \tparam TAcc The accelerator type the work division is computed for.
349 //! \tparam TDev The type of the device.
350 //! \tparam TGridElemExtent The type of the grid element extent.
351 //! \tparam TThreadElemExtent The type of the thread element extent.
//! \param kernelCfg The kernel start configuration providing the grid element extent, the thread element extent
//! and the subdivision options.
352 //! \param dev The device the work division should be valid for.
353 //! \param kernelFnObj The kernel function object which should be executed.
354 //! \param args The kernel invocation arguments.
355 //! \return The work division for the accelerator based on the kernel and argument types
356 template<
357 typename TAcc,
358 typename TDev,
359 typename TGridElemExtent,
360 typename TThreadElemExtent,
361 typename TKernelFnObj,
362 typename... TArgs>
365 [[maybe_unused]] TDev const& dev,
366 TKernelFnObj const& kernelFnObj,
367 TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
368 {
369 using Acc = TAcc;
370
371 // Get max number of threads per block depending on the kernel function attributes.
372 // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
373 // determines the max number of threads per block. This number could be equal or less than the max number of
374 // threads per block defined by device properties.
375 auto const kernelFunctionAttributes
376 = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
377 auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
378
// A zero-dimensional configuration has nothing to subdivide: return the zero-dimensional work division directly.
379 if constexpr(Dim<TGridElemExtent>::value == 0)
380 {
381 auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
382 ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
383 ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
384 return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
385 }
386 else
// Otherwise subdivide the grid, using the kernel's thread limit as the maximum number of threads per block.
387 return subDivideGridElems(
388 getExtents(kernelCfg.gridElemExtent),
389 getExtents(kernelCfg.threadElemExtent),
390 getAccDevProps<Acc>(dev),
391 static_cast<Idx<Acc>>(threadsPerBlock),
392 kernelCfg.blockThreadMustDivideGridThreadExtent,
393 kernelCfg.gridBlockExtentSubDivRestrictions);
394
// NOTE(review): both branches above return, so control flow does not get past this point.
395 using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
397 }
398
399 //! Checks if the work division is supported
400 //!
401 //! \tparam TWorkDiv The type of the work division.
402 //! \tparam TDim The dimensionality of the accelerator device properties.
403 //! \tparam TIdx The idx type of the accelerator device properties.
404 //! \param workDiv The work division to test for validity.
405 //! \param accDevProps The maxima for the work division.
406 //! \return If the work division is valid for the given accelerator device properties.
407 template<typename TWorkDiv, typename TDim, typename TIdx>
408 ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
409 {
410 // Get the extents of grid, blocks and threads of the work division to check.
411 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
412 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
413 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
414
415 // Check that the maximal counts are satisfied.
416 if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
417 {
418 return false;
419 }
420 if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
421 {
422 return false;
423 }
424 if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
425 {
426 return false;
427 }
428
429 // Check that the extents for all dimensions are correct.
430 if constexpr(Dim<TWorkDiv>::value > 0)
431 {
432 // Store the maxima allowed for extents of grid, blocks and threads.
433 auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
434 auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
435 auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
436
437 for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
438 {
439 // No extent is allowed to be zero or greater then the allowed maximum.
440 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
441 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
442 || (threadElemExtentMax[i] < threadElemExtent[i]))
443 {
444 return false;
445 }
446 }
447 }
448
449 return true;
450 }
451
452 //! Checks if the work division is supported for a kernel with the given function attributes
453 //!
//! \tparam TAcc The accelerator type (not referenced in the visible part of the body).
454 //! \tparam TWorkDiv The type of the work division.
455 //! \tparam TDim The dimensionality of the accelerator device properties.
456 //! \tparam TIdx The idx type of the accelerator device properties.
457 //! \param workDiv The work division to test for validity.
458 //! \param accDevProps The maxima for the work division.
459 //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
460 //! be used by this kernel on the given device. This number can be equal to or smaller than the number of
461 //! threads per block supported by the device.
462 //! \return Returns true if the work division is valid for the given accelerator device properties and for the
463 //! given kernel. Otherwise returns false.
464 template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
466 TWorkDiv const& workDiv,
467 AccDevProps<TDim, TIdx> const& accDevProps,
468 KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
469 {
470 // Get the extents of grid, blocks and threads of the work division to check.
471 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
472 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
473 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
474 // Use kernel properties to find the max threads per block for the kernel
475 auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
476 // Select the minimum of the kernel limit and the device limit as the upper bound for the threads per block
477 auto const allowedThreadsPerBlock = std::min(
478 static_cast<TIdx>(threadsPerBlockForKernel),
479 static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
480 // Check that the maximal counts are satisfied.
481 if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
482 {
483 return false;
484 }
485 if(allowedThreadsPerBlock < blockThreadExtent.prod())
486 {
487 return false;
488 }
489 if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
490 {
491 return false;
492 }
493
494 // Check that the extents for all dimensions are correct.
495 if constexpr(Dim<TWorkDiv>::value > 0)
496 {
497 // Store the maxima allowed for extents of grid, blocks and threads.
498 auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
499 auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
500 auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
501
502 for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
503 {
504 // No extent is allowed to be zero or greater than the allowed maximum.
505 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
506 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
507 || (threadElemExtentMax[i] < threadElemExtent[i]))
508 {
509 return false;
510 }
511 }
512 }
513
514 return true;
515 }
516
517 //! Checks if the work division is supported for the kernel on the device
518 //!
519 //! \tparam TAcc The accelerator to test the validity on.
520 //! \tparam TDev The type of the device.
521 //! \tparam TWorkDiv The type of work division to test for validity.
522 //! \param workDiv The work division to test for validity.
523 //! \param dev The device to test the work division for validity on.
524 //! \param kernelFnObj The kernel function object which should be executed.
525 //! \param args The kernel invocation arguments.
526 //! \return True if the work division is valid for the device properties and the kernel function attributes,
//! false otherwise.
527 template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
529 TWorkDiv const& workDiv,
530 TDev const& dev,
531 TKernelFnObj const& kernelFnObj,
532 TArgs&&... args) -> bool
533 {
// Query the device properties and the kernel's function attributes, then defer to the overload taking both.
534 return isValidWorkDiv<TAcc>(
535 workDiv,
536 getAccDevProps<TAcc>(dev),
537 getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
538 }
539
540 //! Checks if the work division is supported by the device
541 //!
542 //! \tparam TAcc The accelerator to test the validity on.
543 //! \param workDiv The work division to test for validity.
544 //! \param dev The device to test the work division for validity on.
545 //! \return If the work division is valid on this accelerator.
546 template<typename TAcc, typename TWorkDiv, typename TDev>
547 ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
548 {
549 return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
550 }
551} // namespace alpaka
552
553#if ALPAKA_COMP_CLANG
554# pragma clang diagnostic pop
555#endif
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition Assert.hpp:13
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
An n-dimensional vector.
Definition Vec.hpp:38
ALPAKA_NO_HOST_ACC_WARNING static ALPAKA_FN_HOST_ACC constexpr auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
Definition Vec.hpp:89
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:43
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const &arg) -> void
This method checks whether integral values are greater than or equal to zero. The implementation prevents war...
Definition Assert.hpp:77
ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
Definition Utility.hpp:46
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition Utility.hpp:27
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where dividend % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition Traits.hpp:29
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc > >
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, AccDevProps< TDim, TIdx > const &accDevProps) -> bool
Checks if the work division is supported.
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T > >
Definition Traits.hpp:59
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
Definition Vec.hpp:554
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
Definition Traits.hpp:83
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex > >
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...