alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
WorkDivHelpers.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/dev/Traits.hpp"
15#include "alpaka/vec/Vec.hpp"
17
18#include <algorithm>
19#include <array>
20#include <cmath>
21#include <functional>
22#include <iterator>
23#include <set>
24#include <type_traits>
25
26#if ALPAKA_COMP_CLANG
27# pragma clang diagnostic push
28# pragma clang diagnostic ignored "-Wswitch-default"
29#endif
30
31//! The alpaka library.
32namespace alpaka
33{
34 //! The restrictions that can be placed on the block thread extent when a grid is subdivided into blocks.
36 {
37 EqualExtent, //!< The block thread extent will be equal in all dimensions.
38 CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
39 Unrestricted, //!< The block thread extent will not have any restrictions.
40 };
41
42 namespace detail
43 {
44 //! Finds the largest divisor where divident % divisor == 0
45 //! \param dividend The dividend.
46 //! \param maxDivisor The maximum divisor.
47 //! \return The biggest number that satisfies the following conditions:
48 //! 1) dividend%ret==0
49 //! 2) ret<=maxDivisor
50 template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
51 ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
52 {
54 core::assertValueUnsigned(maxDivisor);
55 ALPAKA_ASSERT(dividend >= maxDivisor);
56
57 T divisor = maxDivisor;
58 while(dividend % divisor != 0)
59 --divisor;
60 return divisor;
61 }
62
63 //! \param val The value to find divisors of.
64 //! \param maxDivisor The maximum.
65 //! \return A list of all divisors less then or equal to the given maximum.
66 template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
67 ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
68 {
69 std::set<T> divisorSet;
70
72 core::assertValueUnsigned(maxDivisor);
73 ALPAKA_ASSERT(maxDivisor <= val);
74
75 for(T i(1); i <= std::min(val, maxDivisor); ++i)
76 {
77 if(val % i == 0)
78 {
79 divisorSet.insert(static_cast<T>(val / i));
80 }
81 }
82
83 return divisorSet;
84 }
85 } // namespace detail
86
//! Checks that none of the maxima stored in the accelerator device properties is smaller than 1.
//!
87 //! \tparam TDim The dimensionality of the accelerator device properties.
88 //! \tparam TIdx The idx type of the accelerator device properties.
89 //! \param accDevProps The maxima for the work division.
90 //! \return True if all maximum counts and all per-dimension maximum extents are at least 1, false otherwise.
91 template<typename TDim, typename TIdx>
93 {
94 // Check that the maximum counts are greater than or equal to 1.
95 if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
96 || (accDevProps.m_threadElemCountMax < 1))
97 {
98 return false;
99 }
100
101 // Store the maxima allowed for extents of grid, blocks and threads.
102 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
103 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
104 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
105
106 // Check the maximum extents of every dimension.
107 for(typename TDim::value_type i(0); i < TDim::value; ++i)
108 {
109 // Check that the maximum extents are greater than or equal to 1.
110 if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
111 {
112 return false;
113 }
114 }
115
116 return true;
117 }
118
119 //! Subdivides the given grid element extent into blocks and threads. The result is restricted by:
120 //! 1. The maximum block, thread and element extents and counts
121 //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
122 //! 3. The requested restriction on the block thread extent (see gridBlockExtentSubDivRestrictions).
123 //!
124 //! \param gridElemExtent The full extent of elements in the grid.
125 //! \param threadElemExtent The number of elements computed per thread.
126 //! \param accDevProps The maxima for the work division.
127 //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
128 //! used and the device hard limits are used instead.
129 //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
130 //! corresponding block thread extent.
131 //! NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the block
132 //! thread extent will be one in this dimension.
133 //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
//! \return The work division built from the computed grid block extent, the computed block thread extent and the
//! clipped thread element extent.
134 template<typename TDim, typename TIdx>
136 Vec<TDim, TIdx> const& gridElemExtent,
137 Vec<TDim, TIdx> const& threadElemExtent,
138 AccDevProps<TDim, TIdx> const& accDevProps,
139 TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
140 bool blockThreadMustDivideGridThreadExtent = true,
141 GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
143 {
144 using Vec = Vec<TDim, TIdx>;
145 using DimLoopInd = typename TDim::value_type;
146
// Preconditions (only checked when assertions are enabled): all extents are at least 1, the thread element
// extent respects the device limits and the device properties themselves are valid.
147 for(DimLoopInd i(0); i < TDim::value; ++i)
148 {
149 ALPAKA_ASSERT(gridElemExtent[i] >= 1);
150 ALPAKA_ASSERT(threadElemExtent[i] >= 1);
151 ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
152 }
153 ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
154 ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
155
156 // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
157 // optimized.
158 auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
159 auto const gridThreadExtent = [&]
160 {
161 Vec r;
162 for(DimLoopInd i(0u); i < TDim::value; ++i)
163 r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
164#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
165# pragma clang diagnostic push
166# pragma clang diagnostic ignored "-Wnrvo"
167#endif
168 return r;
169#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
170# pragma clang diagnostic pop
171#endif
172 }();
173
174 ///////////////////////////////////////////////////////////////////
175 // Try to calculate an optimal blockThreadExtent.
176
177 // Restrict the max block thread extent from the maximum possible to the grid thread extent.
178 // This removes dimensions not required in the grid thread extent.
179 // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
180 auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
181
182 // For equal block thread extent, restrict it to its minimum component.
183 // For example (512, 256, 1024) will get (256, 256, 256).
184 if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
185 blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
186
187 // Use kernelBlockThreadCountMax if it is not zero, otherwise fall back to the device limit. The kernel limit
// is expected to be no larger than the device limit.
188 TIdx const& blockThreadCountMax
189 = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
190
191 // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
192 // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
193 // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
194 // enough.
195 for(typename TDim::value_type i(0); i < TDim::value; ++i)
196 {
197 blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
198 }
199
200 // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
201 if(blockThreadCountMax == 1)
202 {
203 blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
204 }
205 else if(blockThreadExtent.prod() > blockThreadCountMax)
206 {
207#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
208# pragma clang diagnostic push
209# pragma clang diagnostic ignored "-Wswitch-default"
210#endif
211 switch(gridBlockExtentSubDivRestrictions)
212 {
214 blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
215 break;
217 // Very primitive clipping. Just halve the largest value until it fits.
218 while(blockThreadExtent.prod() > blockThreadCountMax)
219 blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
220 break;
222 // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
223 while(blockThreadExtent.prod() > blockThreadCountMax)
224 {
225 auto const it = std::min_element(
226 blockThreadExtent.begin(),
227 blockThreadExtent.end() - 1, //! \todo why omit the last element?
228 [](TIdx const& a, TIdx const& b)
229 {
// Order components equal to 1 after all other components so that they are only picked when
// nothing else is left.
230 if(a == TIdx{1})
231 return false;
232 if(b == TIdx{1})
233 return true;
234 return a < b;
235 });
236 *it /= TIdx{2};
237 }
238 break;
239 }
240#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
241# pragma clang diagnostic pop
242#endif
243 }
244
245
246 // Make the block thread extent divide the grid thread extent.
247 if(blockThreadMustDivideGridThreadExtent)
248 {
249#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
250# pragma clang diagnostic push
251# pragma clang diagnostic ignored "-Wswitch-default"
252#endif
253 switch(gridBlockExtentSubDivRestrictions)
254 {
256 {
257 // For equal size block extent we have to compute the gcd of all grid thread extents that is less
258 // than the current maximal block thread extent. For this we compute the divisors of all grid
259 // thread extents less than the current maximal block thread extent.
260 std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
261 for(DimLoopInd i(0u); i < TDim::value; ++i)
262 {
263 gridThreadExtentDivisors[i]
264 = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
265 }
266 // The maximal common divisor of all block thread extent is the optimal solution.
// NOTE(review): the loop below re-seeds the left-hand operand with the divisors of dimension 0 in every
// iteration, so for more than two dimensions only dimension 0 and the last dimension contribute to the
// final set. For TDim::value == 1 the loop does not run and the set dereferenced afterwards is empty.
// Both look unintended - confirm.
267 std::set<TIdx> intersects[2u];
268 for(DimLoopInd i(1u); i < TDim::value; ++i)
269 {
270 intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
271 intersects[(i) % 2u].clear();
272 set_intersection(
273 std::begin(intersects[(i - 1u) % 2u]),
274 std::end(intersects[(i - 1u) % 2u]),
275 std::begin(gridThreadExtentDivisors[i]),
276 std::end(gridThreadExtentDivisors[i]),
277 std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
278 }
// The set is ordered, so its last element is the largest common divisor found.
279 TIdx const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
280 blockThreadExtent = Vec::all(maxCommonDivisor);
281 break;
282 }
284 [[fallthrough]];
// Shrink the block thread extent of each dimension independently to the next divisor of the grid thread extent.
286 for(DimLoopInd i(0u); i < TDim::value; ++i)
287 {
288 blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
289 }
290 break;
291 }
292#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
293# pragma clang diagnostic pop
294#endif
295 }
296
297 // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
298 auto gridBlockExtent = [&]
299 {
300 Vec r;
301 for(DimLoopInd i = 0; i < TDim::value; ++i)
302 r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
303#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
304# pragma clang diagnostic push
305# pragma clang diagnostic ignored "-Wnrvo"
306#endif
307 return r;
308#if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
309# pragma clang diagnostic pop
310#endif
311 }();
312
313
314 // Store the maxima allowed for extents of grid, blocks and threads.
315 auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
316 auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
317 auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
318
319 // Clamp the computed extents of every dimension to the device maxima.
320 for(typename TDim::value_type i(0); i < TDim::value; ++i)
321 {
322 // Never exceed the per-dimension maximum extents of the device.
323 if(gridBlockExtentMax[i] < gridBlockExtent[i])
324 {
325 gridBlockExtent[i] = gridBlockExtentMax[i];
326 }
327 if(blockThreadExtentMax[i] < blockThreadExtent[i])
328 {
329 blockThreadExtent[i] = blockThreadExtentMax[i];
330 }
// NOTE(review): the test uses the caller-supplied threadElemExtent while the write goes to the clipped copy.
331 if(threadElemExtentMax[i] < threadElemExtent[i])
332 {
333 clippedThreadElemExtent[i] = threadElemExtentMax[i];
334 }
335 }
336
337 return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
338 }
339
340 //! Kernel start configuration to determine a valid work division
341 //!
//! \tparam TAcc The accelerator type. Its dimension and idx type have to be identical to those of both extent types.
342 //! \tparam TGridElemExtent The type of the grid element extent.
343 //! \tparam TThreadElemExtent The type of the thread element extent.
344 template<
345 typename TAcc,
346 typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
347 typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
349 {
350 //! The full extent of elements in the grid. Defaults to one element in every dimension.
351 TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
352 //! The number of elements computed per thread. Defaults to one element in every dimension.
353 TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
354 //! If this is true, the grid thread extent will be multiples of
355 //! the corresponding block thread extent.
356 //! NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the block
357 //! thread extent will be one in this dimension.
358 bool blockThreadMustDivideGridThreadExtent = true;
359 //! The grid block extent subdivision restrictions.
360 GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
361 = GridBlockExtentSubDivRestrictions::Unrestricted;
362
// Compile-time checks that the extent types match the accelerator in dimension and idx type.
363 static_assert(
365 "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
366 static_assert(
368 "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
369 static_assert(
370 std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
371 "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
372 static_assert(
373 std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
374 "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
375 };
376
//! Computes a work division for a kernel from a kernel start configuration, taking the kernel's own limit on the
//! number of threads per block into account.
//!
//! \tparam TAcc The accelerator type the work division is computed for.
377 //! \tparam TDev The type of the device.
378 //! \tparam TGridElemExtent The type of the grid element extent.
379 //! \tparam TThreadElemExtent The type of the thread element extent.
//! \tparam TKernelFnObj The type of the kernel function object.
//! \tparam TArgs The types of the kernel invocation arguments.
//! \param kernelCfg The kernel start configuration (extents and subdivision options).
380 //! \param dev The device the work division should be valid for.
381 //! \param kernelFnObj The kernel function object which should be executed.
382 //! \param args The kernel invocation arguments.
383 //! \return The work division for the accelerator based on the kernel and argument types. For a zero-dimensional
//! grid element extent a work division of empty vectors is returned.
384 template<
385 typename TAcc,
386 typename TDev,
387 typename TGridElemExtent,
388 typename TThreadElemExtent,
389 typename TKernelFnObj,
390 typename... TArgs>
393 [[maybe_unused]] TDev const& dev,
394 TKernelFnObj const& kernelFnObj,
395 TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
396 {
397 using Acc = TAcc;
398
399 // Get max number of threads per block depending on the kernel function attributes.
400 // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
401 // determines the max number of threads per block. This number could be equal or less than the max number of
402 // threads per block defined by device properties.
403 auto const kernelFunctionAttributes
404 = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
405 auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
406
407 if constexpr(Dim<TGridElemExtent>::value == 0)
408 {
// Zero-dimensional case: there is nothing to subdivide, all extents are the empty vector.
409 auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
410 ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
411 ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
412 return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
413 }
414 else
// Pass the kernel's thread limit on so that it takes precedence over the device limit when it is not zero.
415 return subDivideGridElems(
416 getExtents(kernelCfg.gridElemExtent),
417 getExtents(kernelCfg.threadElemExtent),
418 getAccDevProps<Acc>(dev),
419 static_cast<Idx<Acc>>(threadsPerBlock),
420 kernelCfg.blockThreadMustDivideGridThreadExtent,
421 kernelCfg.gridBlockExtentSubDivRestrictions);
422
// Both branches above return, so nothing below is reached at run time.
// NOTE(review): V is presumably only needed by a compiler workaround for return statements in
// 'if constexpr' branches - confirm.
423 using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
425 }
426
427 //! Checks if the work division is supported
428 //!
429 //! \tparam TWorkDiv The type of the work division.
430 //! \tparam TDim The dimensionality of the accelerator device properties.
431 //! \tparam TIdx The idx type of the accelerator device properties.
432 //! \param workDiv The work division to test for validity.
433 //! \param accDevProps The maxima for the work division.
434 //! \return If the work division is valid for the given accelerator device properties.
435 template<typename TWorkDiv, typename TDim, typename TIdx>
436 ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
437 {
438 // Get the extents of grid, blocks and threads of the work division to check.
439 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
440 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
441 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
442
443 // Check that the maximal counts are satisfied.
444 if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
445 {
446 return false;
447 }
448 if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
449 {
450 return false;
451 }
452 if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
453 {
454 return false;
455 }
456
457 // Check that the extents for all dimensions are correct.
458 if constexpr(Dim<TWorkDiv>::value > 0)
459 {
460 // Store the maxima allowed for extents of grid, blocks and threads.
461 auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
462 auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
463 auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
464
465 for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
466 {
467 // No extent is allowed to be zero or greater then the allowed maximum.
468 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
469 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
470 || (threadElemExtentMax[i] < threadElemExtent[i]))
471 {
472 return false;
473 }
474 }
475 }
476
477 return true;
478 }
479
480 //! Checks if the work division is supported, also taking the limits of a specific kernel into account
481 //!
//! \tparam TAcc The accelerator type. It is not deducible from the arguments and has to be given explicitly.
482 //! \tparam TWorkDiv The type of the work division.
483 //! \tparam TDim The dimensionality of the accelerator device properties.
484 //! \tparam TIdx The idx type of the accelerator device properties.
485 //! \param workDiv The work division to test for validity.
486 //! \param accDevProps The maxima for the work division.
487 //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
488 //! be used by this kernel on the given device. This number can be equal to or smaller than the number of
489 //! threads per block supported by the device.
490 //! \return Returns true if the work division is valid for the given accelerator device properties and for the
491 //! given kernel. Otherwise returns false.
492 template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
494 TWorkDiv const& workDiv,
495 AccDevProps<TDim, TIdx> const& accDevProps,
496 KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
497 {
498 // Get the extents of grid, blocks and threads of the work division to check.
499 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
500 auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
501 auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
502 // Use kernel properties to find the max threads per block for the kernel
503 auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
504 // Select the minimum of the kernel limit and the device limit as the upper bound for the threads per block
505 auto const allowedThreadsPerBlock = std::min(
506 static_cast<TIdx>(threadsPerBlockForKernel),
507 static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
508 // Check that the maximal counts are satisfied.
509 if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
510 {
511 return false;
512 }
513 if(allowedThreadsPerBlock < blockThreadExtent.prod())
514 {
515 return false;
516 }
517 if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
518 {
519 return false;
520 }
521
522 // Check that the extents for all dimensions are correct.
523 if constexpr(Dim<TWorkDiv>::value > 0)
524 {
525 // Store the maxima allowed for extents of grid, blocks and threads.
526 auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
527 auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
528 auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
529
530 for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
531 {
532 // No extent is allowed to be zero or greater than the allowed maximum.
533 if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
534 || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
535 || (threadElemExtentMax[i] < threadElemExtent[i]))
536 {
537 return false;
538 }
539 }
540 }
541
542 return true;
543 }
544
545 //! Checks if the work division is supported for the kernel on the device
546 //!
547 //! \tparam TAcc The accelerator to test the validity on.
548 //! \tparam TDev The type of the device.
549 //! \tparam TWorkDiv The type of work division to test for validity.
//! \tparam TKernelFnObj The type of the kernel function object.
//! \tparam TArgs The types of the kernel invocation arguments.
550 //! \param workDiv The work division to test for validity.
551 //! \param dev The device to test the work division for validity on.
552 //! \param kernelFnObj The kernel function object which should be executed.
553 //! \param args The kernel invocation arguments.
554 //! \return The result of the isValidWorkDiv overload that takes the device properties and the kernel function
//! attributes, both of which are queried from the given device.
555 template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
557 TWorkDiv const& workDiv,
558 TDev const& dev,
559 TKernelFnObj const& kernelFnObj,
560 TArgs&&... args) -> bool
561 {
562 return isValidWorkDiv<TAcc>(
563 workDiv,
564 getAccDevProps<TAcc>(dev),
565 getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
566 }
567
568 //! Checks if the work division is supported by the device
569 //!
570 //! \tparam TAcc The accelerator to test the validity on.
571 //! \param workDiv The work division to test for validity.
572 //! \param dev The device to test the work division for validity on.
573 //! \return If the work division is valid on this accelerator.
574 template<typename TAcc, typename TWorkDiv, typename TDev>
575 ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
576 {
577 return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
578 }
579} // namespace alpaka
580
581#if ALPAKA_COMP_CLANG
582# pragma clang diagnostic pop
583#endif
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition Assert.hpp:13
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
An n-dimensional vector.
Definition Vec.hpp:38
ALPAKA_NO_HOST_ACC_WARNING static ALPAKA_FN_HOST_ACC constexpr auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
Definition Vec.hpp:89
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:43
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const &arg) -> void
This method checks integral values if they are greater or equal zero. The implementation prevents war...
Definition Assert.hpp:77
ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
Computes the floor of the nth root of value, in integers.
Definition Utility.hpp:46
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition Utility.hpp:27
ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const &dividend, T const &maxDivisor) -> T
Finds the largest divisor where dividend % divisor == 0.
ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const &val, T const &maxDivisor) -> std::set< T >
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition Traits.hpp:29
ALPAKA_FN_HOST auto getValidWorkDiv(KernelCfg< TAcc, TGridElemExtent, TThreadElemExtent > const &kernelCfg, TDev const &dev, TKernelFnObj const &kernelFnObj, TArgs &&... args) -> WorkDivMembers< Dim< TAcc >, Idx< TAcc > >
ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps< TDim, TIdx > const &accDevProps) -> bool
ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const &workDiv, AccDevProps< TDim, TIdx > const &accDevProps) -> bool
Checks if the work division is supported.
GridBlockExtentSubDivRestrictions
The grid block extent subdivision restrictions.
@ Unrestricted
The block thread extent will not have any restrictions.
@ CloseToEqualExtent
The block thread extent will be as close to equal as possible in all dimensions.
@ EqualExtent
The block thread extent will be equal in all dimensions.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T > >
Definition Traits.hpp:59
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
Definition Vec.hpp:554
typename trait::AccType< T >::type Acc
The accelerator type trait alias template to remove the ::type.
Definition Traits.hpp:83
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex > >
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_FN_HOST auto subDivideGridElems(Vec< TDim, TIdx > const &gridElemExtent, Vec< TDim, TIdx > const &threadElemExtent, AccDevProps< TDim, TIdx > const &accDevProps, TIdx kernelBlockThreadCountMax=static_cast< TIdx >(0u), bool blockThreadMustDivideGridThreadExtent=true, GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions=GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers< TDim, TIdx >
Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
The acceleration properties on a device.
TIdx m_gridBlockCountMax
The maximum number of blocks in a grid.
Vec< TDim, TIdx > m_gridBlockExtentMax
The maximum number of blocks in each dimension of the grid.
TIdx m_threadElemCountMax
The maximum number of elements in a thread.
Vec< TDim, TIdx > m_blockThreadExtentMax
The maximum number of threads in each dimension of a block.
Vec< TDim, TIdx > m_threadElemExtentMax
The maximum number of elements in each dimension of a thread.
TIdx m_blockThreadCountMax
The maximum number of threads in a block.
Kernel start configuration to determine a valid work division.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...