alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WorkDivHelpers.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/dev/Traits.hpp"
15#include "alpaka/vec/Vec.hpp"
17
18#include <algorithm>
19#include <array>
20#include <cmath>
21#include <functional>
22#include <set>
23#include <type_traits>
24
25#if BOOST_COMP_CLANG
26# pragma clang diagnostic push
27# pragma clang diagnostic ignored "-Wswitch-default"
28#endif
29
30//! The alpaka library.
31namespace alpaka
32{
//! The grid block extent subdivision restrictions.
//! Selects how the block thread extent may be shaped when a grid is subdivided into blocks.
enum class GridBlockExtentSubDivRestrictions
{
    EqualExtent, //!< The block thread extent will be equal in all dimensions.
    CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
    Unrestricted, //!< The block thread extent will not have any restrictions.
};
40
41 namespace detail
42 {
43 //! Finds the largest divisor where divident % divisor == 0
44 //! \param dividend The dividend.
45 //! \param maxDivisor The maximum divisor.
46 //! \return The biggest number that satisfies the following conditions:
47 //! 1) dividend%ret==0
48 //! 2) ret<=maxDivisor
49 template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
50 ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
51 {
53 core::assertValueUnsigned(maxDivisor);
54 ALPAKA_ASSERT(dividend >= maxDivisor);
55
56 T divisor = maxDivisor;
57 while(dividend % divisor != 0)
58 --divisor;
59 return divisor;
60 }
61
62 //! \param val The value to find divisors of.
63 //! \param maxDivisor The maximum.
64 //! \return A list of all divisors less then or equal to the given maximum.
65 template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
66 ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
67 {
68 std::set<T> divisorSet;
69
71 core::assertValueUnsigned(maxDivisor);
72 ALPAKA_ASSERT(maxDivisor <= val);
73
74 for(T i(1); i <= std::min(val, maxDivisor); ++i)
75 {
76 if(val % i == 0)
77 {
78 divisorSet.insert(static_cast<T>(val / i));
79 }
80 }
81
82 return divisorSet;
83 }
84 } // namespace detail
85
86 //! \tparam TDim The dimensionality of the accelerator device properties.
87 //! \tparam TIdx The idx type of the accelerator device properties.
88 //! \param accDevProps The maxima for the work division.
89 //! \return If the accelerator device properties are valid.
90 template<typename TDim, typename TIdx>
92 {
93 // Check that the maximum counts are greater or equal 1.
94 if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
95 || (accDevProps.m_threadElemCountMax < 1))
96 {
97 return false;
98 }
99
100 // Store the maxima allowed for extents of grid, blocks and threads.
101 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
102 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
103 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
104
105 // Check that the extents for all dimensions are correct.
106 for(typename TDim::value_type i(0); i < TDim::value; ++i)
107 {
108 // Check that the maximum extents are greater or equal 1.
109 if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
110 {
111 return false;
112 }
113 }
114
115 return true;
116 }
117
118 //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
119 //! 1. The the maxima block, thread and element extent and counts
120 //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
121 //! 3. The requirement of the block extent.
122 //!
123 //! \param gridElemExtent The full extent of elements in the grid.
124 //! \param threadElemExtent the number of elements computed per thread.
125 //! \param accDevProps The maxima for the work division.
126 //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
127 //! used, device hard limits are used.
128 //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
129 //! corresponding block thread extent.
130 //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
131 //! thread extent will be one in this dimension.
132 //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
133 template<typename TDim, typename TIdx>
135 Vec<TDim, TIdx> const& gridElemExtent,
136 Vec<TDim, TIdx> const& threadElemExtent,
137 AccDevProps<TDim, TIdx> const& accDevProps,
138 TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
139 bool blockThreadMustDivideGridThreadExtent = true,
140 GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
142 {
143 using Vec = Vec<TDim, TIdx>;
144 using DimLoopInd = typename TDim::value_type;
145
146 for(DimLoopInd i(0); i < TDim::value; ++i)
147 {
148 ALPAKA_ASSERT(gridElemExtent[i] >= 1);
149 ALPAKA_ASSERT(threadElemExtent[i] >= 1);
150 ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
151 }
152 ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
153 ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
154
155 // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
156 // optimized.
157 auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
158 auto const gridThreadExtent = [&]
159 {
160 Vec r;
161 for(DimLoopInd i(0u); i < TDim::value; ++i)
162 r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
163 return r;
164 }();
165
166 ///////////////////////////////////////////////////////////////////
167 // Try to calculate an optimal blockThreadExtent.
168
169 // Restrict the max block thread extent from the maximum possible to the grid thread extent.
170 // This removes dimensions not required in the grid thread extent.
171 // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
172 auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
173
174 // For equal block thread extent, restrict it to its minimum component.
175 // For example (512, 256, 1024) will get (256, 256, 256).
176 if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
177 blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
178
179 // Choose kernelBlockThreadCountMax if it is not zero. It is less than the accelerator properties.
180 TIdx const& blockThreadCountMax
181 = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
182
183 // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
184 // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
185 // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
186 // enough.
187 for(typename TDim::value_type i(0); i < TDim::value; ++i)
188 {
189 blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
190 }
191
192 // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
193 if(blockThreadCountMax == 1)
194 {
195 blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
196 }
197 else if(blockThreadExtent.prod() > blockThreadCountMax)
198 {
199 switch(gridBlockExtentSubDivRestrictions)
200 {
202 blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
203 break;
205 // Very primitive clipping. Just halve the largest value until it fits.
206 while(blockThreadExtent.prod() > blockThreadCountMax)
207 blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
208 break;
210 // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
211 while(blockThreadExtent.prod() > blockThreadCountMax)
212 {
213 auto const it = std::min_element(
214 blockThreadExtent.begin(),
215 blockThreadExtent.end() - 1, //! \todo why omit the last element?
216 [](TIdx const& a, TIdx const& b)
217 {
218 if(a == TIdx{1})
219 return false;
220 if(b == TIdx{1})
221 return true;
222 return a < b;
223 });
224 *it /= TIdx{2};
225 }
226 break;
227 }
228 }
229
230
231 // Make the block thread extent divide the grid thread extent.
232 if(blockThreadMustDivideGridThreadExtent)
233 {
234 switch(gridBlockExtentSubDivRestrictions)
235 {
237 {
238 // For equal size block extent we have to compute the gcd of all grid thread extent that is less
239 // then the current maximal block thread extent. For this we compute the divisors of all grid
240 // thread extent less then the current maximal block thread extent.
241 std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
242 for(DimLoopInd i(0u); i < TDim::value; ++i)
243 {
244 gridThreadExtentDivisors[i]
245 = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
246 }
247 // The maximal common divisor of all block thread extent is the optimal solution.
248 std::set<TIdx> intersects[2u];
249 for(DimLoopInd i(1u); i < TDim::value; ++i)
250 {
251 intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
252 intersects[(i) % 2u].clear();
253 set_intersection(
254 std::begin(intersects[(i - 1u) % 2u]),
255 std::end(intersects[(i - 1u) % 2u]),
256 std::begin(gridThreadExtentDivisors[i]),
257 std::end(gridThreadExtentDivisors[i]),
258 std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
259 }
260 TIdx const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
261 blockThreadExtent = Vec::all(maxCommonDivisor);
262 break;
263 }
265 [[fallthrough]];
267 for(DimLoopInd i(0u); i < TDim::value; ++i)
268 {
269 blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
270 }
271 break;
272 }
273 }
274
275 // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
276 auto gridBlockExtent = [&]
277 {
278 Vec r;
279 for(DimLoopInd i = 0; i < TDim::value; ++i)
280 r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
281 return r;
282 }();
283
284
285 // Store the maxima allowed for extents of grid, blocks and threads.
286 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
287 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
288 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
289
290 // Check that the extents for all dimensions are correct.
291 for(typename TDim::value_type i(0); i < TDim::value; ++i)
292 {
293 // Check that the maximum extents are greater or equal 1.
294 if(gridBlockExtentMax[i] < gridBlockExtent[i])
295 {
296 gridBlockExtent[i] = gridBlockExtentMax[i];
297 }
298 if(blockThreadExtentMax[i] < blockThreadExtent[i])
299 {
300 blockThreadExtent[i] = blockThreadExtentMax[i];
301 }
302 if(threadElemExtentMax[i] < threadElemExtent[i])
303 {
304 clippedThreadElemExtent[i] = threadElemExtentMax[i];
305 }
306 }
307
308 return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
309 }
310
311 //! Kernel start configuration to determine a valid work division
312 //!
313 //! \tparam TGridElemExtent The type of the grid element extent.
314 //! \tparam TThreadElemExtent The type of the thread element extent.
315 template<
316 typename TAcc,
317 typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
318 typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
320 {
321 //! The full extent of elements in the grid.
322 TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
323 //! The number of elements computed per thread.
324 TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
325 //! If this is true, the grid thread extent will be multiples of
326 //! the corresponding block thread extent.
327 //! NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
328 //! thread extent will be one in this dimension.
329 bool blockThreadMustDivideGridThreadExtent = true;
330 //! The grid block extent subdivision restrictions.
331 GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
332 = GridBlockExtentSubDivRestrictions::Unrestricted;
333
334 static_assert(
336 "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
337 static_assert(
339 "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
340 static_assert(
341 std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
342 "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
343 static_assert(
344 std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
345 "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
346 };
347
348 //! \tparam TDev The type of the device.
349 //! \tparam TGridElemExtent The type of the grid element extent.
350 //! \tparam TThreadElemExtent The type of the thread element extent.
351 //! \param dev The device the work division should be valid for.
352 //! \param kernelFnObj The kernel function object which should be executed.
353 //! \param args The kernel invocation arguments.
354 //! \return The work division for the accelerator based on the kernel and argument types
355 template<
356 typename TAcc,
357 typename TDev,
358 typename TGridElemExtent,
359 typename TThreadElemExtent,
360 typename TKernelFnObj,
361 typename... TArgs>
364 [[maybe_unused]] TDev const& dev,
365 TKernelFnObj const& kernelFnObj,
366 TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
367 {
368 using Acc = TAcc;
369
370 // Get max number of threads per block depending on the kernel function attributes.
371 // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
372 // determines the max number of threads per block. This number could be equal or less than the max number of
373 // threads per block defined by device properties.
374 auto const kernelFunctionAttributes
375 = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
376 auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
377
378 if constexpr(Dim<TGridElemExtent>::value == 0)
379 {
380 auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
381 ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
382 ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
383 return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
384 }
385 else
386 return subDivideGridElems(
387 getExtents(kernelCfg.gridElemExtent),
388 getExtents(kernelCfg.threadElemExtent),
389 getAccDevProps<Acc>(dev),
390 static_cast<Idx<Acc>>(threadsPerBlock),
391 kernelCfg.blockThreadMustDivideGridThreadExtent,
392 kernelCfg.gridBlockExtentSubDivRestrictions);
393
394 using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
396 }
397
398 //! Checks if the work division is supported
399 //!
400 //! \tparam TWorkDiv The type of the work division.
401 //! \tparam TDim The dimensionality of the accelerator device properties.
402 //! \tparam TIdx The idx type of the accelerator device properties.
403 //! \param workDiv The work division to test for validity.
404 //! \param accDevProps The maxima for the work division.
405 //! \return If the work division is valid for the given accelerator device properties.
406 template<typename TWorkDiv, typename TDim, typename TIdx>
407 ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
408 {
409 // Get the extents of grid, blocks and threads of the work division to check.
410 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
411 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
412 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
413
414 // Check that the maximal counts are satisfied.
415 if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
416 {
417 return false;
418 }
419 if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
420 {
421 return false;
422 }
423 if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
424 {
425 return false;
426 }
427
428 // Check that the extents for all dimensions are correct.
429 if constexpr(Dim<TWorkDiv>::value > 0)
430 {
431 // Store the maxima allowed for extents of grid, blocks and threads.
432 auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
433 auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
434 auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
435
436 for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
437 {
438 // No extent is allowed to be zero or greater then the allowed maximum.
439 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
440 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
441 || (threadElemExtentMax[i] < threadElemExtent[i]))
442 {
443 return false;
444 }
445 }
446 }
447
448 return true;
449 }
450
451 //! Checks if the work division is supported
452 //!
453 //! \tparam TWorkDiv The type of the work division.
454 //! \tparam TDim The dimensionality of the accelerator device properties.
455 //! \tparam TIdx The idx type of the accelerator device properties.
456 //! \param workDiv The work division to test for validity.
457 //! \param accDevProps The maxima for the work division.
458 //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
459 //! be used by this kernel on the given device. This number can be equal to or smaller than the the number of
460 //! threads per block supported by the device.
461 //! \return Returns true if the work division is valid for the given accelerator device properties and for the
462 //! given kernel. Otherwise returns false.
463 template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
465 TWorkDiv const& workDiv,
466 AccDevProps<TDim, TIdx> const& accDevProps,
467 KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
468 {
469 // Get the extents of grid, blocks and threads of the work division to check.
470 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
471 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
472 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
473 // Use kernel properties to find the max threads per block for the kernel
474 auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
475 // Select the minimum to find the upper bound for the threads per block
476 auto const allowedThreadsPerBlock = std::min(
477 static_cast<TIdx>(threadsPerBlockForKernel),
478 static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
479 // Check that the maximal counts are satisfied.
480 if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
481 {
482 return false;
483 }
484 if(allowedThreadsPerBlock < blockThreadExtent.prod())
485 {
486 return false;
487 }
488 if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
489 {
490 return false;
491 }
492
493 // Check that the extents for all dimensions are correct.
494 if constexpr(Dim<TWorkDiv>::value > 0)
495 {
496 // Store the maxima allowed for extents of grid, blocks and threads.
497 auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
498 auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
499 auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
500
501 for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
502 {
503 // No extent is allowed to be zero or greater then the allowed maximum.
504 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
505 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
506 || (threadElemExtentMax[i] < threadElemExtent[i]))
507 {
508 return false;
509 }
510 }
511 }
512
513 return true;
514 }
515
516 //! Checks if the work division is supported for the kernel on the device
517 //!
518 //! \tparam TAcc The accelerator to test the validity on.
519 //! \tparam TDev The type of the device.
520 //! \tparam TWorkDiv The type of work division to test for validity.
521 //! \param workDiv The work division to test for validity.
522 //! \param dev The device to test the work division for validity on.
523 //! \param kernelFnObj The kernel function object which should be executed.
524 //! \param args The kernel invocation arguments.
525 //! \return Returns the value of isValidWorkDiv function.
526 template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
528 TWorkDiv const& workDiv,
529 TDev const& dev,
530 TKernelFnObj const& kernelFnObj,
531 TArgs&&... args) -> bool
532 {
533 return isValidWorkDiv<TAcc>(
534 workDiv,
535 getAccDevProps<TAcc>(dev),
536 getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
537 }
538
539 //! Checks if the work division is supported by the device
540 //!
541 //! \tparam TAcc The accelerator to test the validity on.
542 //! \param workDiv The work division to test for validity.
543 //! \param dev The device to test the work division for validity on.
544 //! \return If the work division is valid on this accelerator.
545 template<typename TAcc, typename TWorkDiv, typename TDev>
546 ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
547 {
548 return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
549 }
550} // namespace alpaka
551
552#if BOOST_COMP_CLANG
553# pragma clang diagnostic pop
554#endif
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition Assert.hpp:13
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
A n-dimensional vector.
Definition Vec.hpp:38
ALPAKA_NO_HOST_ACC_WARNING static ALPAKA_FN_HOST_ACC constexpr auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
Definition Vec.hpp:89
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const &arg) -> void
This method checks integral values if they are greater or equal zero. The implementation prevents war...
Definition Assert.hpp:77
ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
Definition Utility.hpp:46
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition Utility.hpp:27
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where dividend % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition Traits.hpp:29
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc > >
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, AccDevProps< TDim, TIdx > const &accDevProps) -> bool
Checks if the work division is supported.
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T > >
Definition Traits.hpp:59
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
Definition Vec.hpp:541
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
Definition Traits.hpp:78
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex > >
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...