alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
UniformElements.hpp
Go to the documentation of this file.
1#pragma once
2
7
8#include <algorithm>
9#include <cstddef>
10#include <type_traits>
11
12namespace alpaka
13{
14
15 namespace detail
16 {
17
18 /* UniformElementsAlong
19 *
20 * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iteratable range that
21 * spans the element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension. If
22 * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
23 * size along the `Dim` dimension.
24 *
25 * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
26 * can infer the accelerator type from the argument.
27 *
28 * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
29 * 0>(acc, ...)`.
30 *
31 * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
32 * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
33 * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
34 * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
35 *
36 * To cover the problem space, different threads may execute a different number of iterations. As a result, it
37 * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
38 * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
39 * loop over each group's elements, and synchronise only in the outer loop:
40 *
41 * for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
42 * for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
43 * // first part of the computation
44 * // no synchronisations here
45 * ...
46 * }
47 * // wait for all threads to complete the first part
48 * alpaka::syncBlockThreads();
49 * for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
50 * // second part of the computation
51 * // no synchronisations here
52 * ...
53 * }
54 * // wait for all threads to complete the second part
55 * alpaka::syncBlockThreads();
56 * ...
57 * }
58 *
59 * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
60 * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
61 * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
62 * check the element index explicitly inside the loop:
63 *
64 * for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
65 * bool flag = false;
66 * if (element < extent) {
67 * // do some work and compute a result flag only for the valid elements
68 * flag = do_some_work();
69 * }
70 * // check if any valid element had a positive result
71 * if (alpaka::warp::any(acc, flag)) {
72 * // ...
73 * }
74 * }
75 *
76 * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
77 * `N-1`.
78 */
79
80 template<concepts::Acc TAcc, std::size_t Dim>
81 requires(alpaka::Dim<TAcc>::value >= Dim)
83 {
84 public:
86
87 ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
88 : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
89 , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
90 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
91 , extent_{stride_}
92 {
93 }
94
95 ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
96 : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
97 , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
98 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
99 , extent_{extent}
100 {
101 }
102
103 ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
104 : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
105 , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
106 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
107 , extent_{extent}
108 {
109 }
110
111 class const_iterator;
113
115 {
116 return const_iterator(elements_, stride_, extent_, first_);
117 }
118
120 {
121 return const_iterator(elements_, stride_, extent_, extent_);
122 }
123
125 {
127
128 ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
129 : elements_{elements}
130 ,
131 // we need to reduce the stride by on element range because index_ is later increased with each
132 // increment
133 stride_{stride - elements}
134 , extent_{extent}
135 , index_{std::min(first, extent)}
136 {
137 }
138
139 public:
141 {
142 return index_;
143 }
144
145 // pre-increment the iterator
147 {
148 // increment the index along the elements processed by the current thread
149 ++indexElem_;
150 ++index_;
151 if(indexElem_ >= elements_)
152 {
153 indexElem_ = 0;
154 index_ += stride_;
155 }
156 if(index_ >= extent_)
157 index_ = extent_;
158
159 return *this;
160 }
161
162 // post-increment the iterator
164 {
165 const_iterator old = *this;
166 ++(*this);
167 return old;
168 }
169
170 ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
171 {
172 return (*(*this) == *other);
173 }
174
175 ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
176 {
177 return not(*this == other);
178 }
179
180 private:
181 // non-const to support iterator copy and assignment
182 Idx elements_;
183 Idx stride_;
184 Idx extent_;
185 // modified by the pre/post-increment operator
186 Idx index_;
187 Idx indexElem_ = 0;
188 };
189
190 private:
191 Idx const elements_;
192 Idx const first_;
193 Idx const stride_;
194 Idx const extent_;
195 };
196
197 } // namespace detail
198
199 /* uniformElements
200 *
201 * `uniformElements(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element
202 * indices from `first` (inclusive) to `extent` (exclusive). If `first` is not specified, it defaults to 0. If
203 * `extent` is not specified, it defaults to the kernel grid size.
204 *
205 * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
206 *
207 * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
208 * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
209 * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
210 * over each group's elements, and synchronise only in the outer loop:
211 *
212 * for (auto group : uniformGroups(acc, extent)) {
213 * for (auto element : uniformGroupElements(acc, group, extent)) {
214 * // first part of the computation
215 * // no synchronisations here
216 * ...
217 * }
218 * // wait for all threads to complete the first part
219 * alpaka::syncBlockThreads();
220 * for (auto element : uniformGroupElements(acc, group, extent)) {
221 * // second part of the computation
222 * // no synchronisations here
223 * ...
224 * }
225 * // wait for all threads to complete the second part
226 * alpaka::syncBlockThreads();
227 * ...
228 * }
229 *
230 * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
231 * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
232 * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
233 * element index explicitly inside the loop:
234 *
235 * for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
236 * bool flag = false;
237 * if (element < extent) {
238 * // do some work and compute a result flag only for elements up to extent
239 * flag = do_some_work();
240 * }
241 * // check if any valid element had a positive result
242 * if (alpaka::warp::any(acc, flag)) {
243 * // ...
244 * }
245 * }
246 *
247 * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
248 * use
249 * - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
250 * - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
251 * - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
252 * to loop along the fastest, second-fastest, or third-fastest dimension.
253 */
254
255 template<concepts::Acc TAcc, typename... TArgs>
256 requires(alpaka::Dim<TAcc>::value == 1)
257 ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
258 {
259 using Idx = alpaka::Idx<TAcc>;
260 return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
261 }
262
263 /* uniformElementsAlong<Dim>
264 *
265 * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
266 * that can infer the accelerator type from the argument.
267 */
268
269 template<std::size_t Dim, concepts::Acc TAcc, typename... TArgs>
270 requires(alpaka::Dim<TAcc>::value >= Dim)
271 ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
272 {
273 using Idx = alpaka::Idx<TAcc>;
274 return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
275 }
276
277 /* uniformElementsAlongX, Y, Z
278 *
279 * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
280 * dimensions.
281 */
282
283 template<concepts::Acc TAcc, typename... TArgs>
284 requires(alpaka::Dim<TAcc>::value > 0)
285 ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
286 {
287 using Idx = alpaka::Idx<TAcc>;
288 return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
289 }
290
291 template<concepts::Acc TAcc, typename... TArgs>
292 requires(alpaka::Dim<TAcc>::value > 1)
293 ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
294 {
295 using Idx = alpaka::Idx<TAcc>;
296 return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
297 }
298
299 template<concepts::Acc TAcc, typename... TArgs>
300 requires(alpaka::Dim<TAcc>::value > 2)
301 ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
302 {
303 using Idx = alpaka::Idx<TAcc>;
304 return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
305 }
306
307 namespace detail
308 {
309
310 /* UniformElementsND
311 *
312 * `UniformElementsND(acc, extent)` returns an N-dimensional iteratable range that spans the element indices
313 * required to cover the given problem size, indicated by `extent`.
314 *
315 * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
316 *
317 * To cover the problem space, different threads may execute a different number of iterations. As a result, it
318 * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
319 * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
320 * loop over each group's elements, and synchronise only in the outer loop:
321 *
322 * for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
323 * for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
324 * for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
325 * for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
326 * // first part of the computation
327 * // no synchronisations here
328 * ...
329 * }
330 * }
331 * // wait for all threads to complete the first part
332 * alpaka::syncBlockThreads();
333 * for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
334 * for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
335 * // second part of the computation
336 * // no synchronisations here
337 * ...
338 * }
339 * }
340 * // wait for all threads to complete the second part
341 * alpaka::syncBlockThreads();
342 * ...
343 * }
344 * }
345 *
346 * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
347 */
348
349 template<concepts::Acc TAcc>
350 requires(alpaka::Dim<TAcc>::value > 0)
351 class UniformElementsND
352 {
353 public:
354 using Dim = alpaka::Dim<TAcc>;
355 using Idx = alpaka::Idx<TAcc>;
356 using Vec = alpaka::Vec<Dim, Idx>;
357
358 ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
359 : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
360 , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
361 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
362 , extent_{stride_}
363 {
364 }
365
366 ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
367 : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
368 , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
369 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
370 , extent_{extent}
371 {
372 }
373
374 // tag used to construct an end iterator
375 struct at_end_t
376 {
377 };
378
379 class const_iterator;
380 using iterator = const_iterator;
381
382 ALPAKA_FN_ACC inline const_iterator begin() const
383 {
384 // check that all dimensions of the current thread index are within the extent
385 if((thread_ < extent_).all())
386 {
387 // construct an iterator pointing to the first element to be processed by the current thread
388 return const_iterator{this, thread_};
389 }
390 else
391 {
392 // construct an end iterator, pointing post the end of the extent
393 return const_iterator{this, at_end_t{}};
394 }
395 }
396
397 ALPAKA_FN_ACC inline const_iterator end() const
398 {
399 // construct an end iterator, pointing post the end of the extent
400 return const_iterator{this, at_end_t{}};
401 }
402
404 {
405 friend class UniformElementsND;
406
407 public:
409 {
410 return index_;
411 }
412
413 // pre-increment the iterator
415 {
416 increment();
417 return *this;
418 }
419
420 // post-increment the iterator
422 {
423 const_iterator old = *this;
424 increment();
425 return old;
426 }
427
428 ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
429 {
430 return (index_ == other.index_);
431 }
432
433 ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
434 {
435 return not(*this == other);
436 }
437
438 private:
439 // construct an iterator pointing to the first element to be processed by the current thread
440 ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
441 : loop_{loop}
442 , first_{alpaka::elementwise_min(first, loop->extent_)}
443 , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
444 , index_{first_}
445 {
446 }
447
448 // construct an end iterator, pointing post the end of the extent
449 ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
450 : loop_{loop}
451 , first_{loop_->extent_}
452 , range_{loop_->extent_}
453 , index_{loop_->extent_}
454 {
455 }
456
457 template<size_t I>
458 ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
459 {
460 bool overflow = false;
461 ++index_[I];
462 if(index_[I] >= range_[I])
463 {
464 index_[I] = first_[I];
465 overflow = true;
466 }
467 return overflow;
468 }
469
470 template<size_t N>
471 ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
472 {
473 if constexpr(N == 0)
474 {
475 // overflow
476 return true;
477 }
478 else
479 {
480 if(not nth_elements_loop<N - 1>())
481 {
482 return false;
483 }
484 else
485 {
486 return do_elements_loops<N - 1>();
487 }
488 }
489 ALPAKA_UNREACHABLE(false);
490 }
491
492 template<size_t I>
493 ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
494 {
495 bool overflow = false;
496 first_[I] += loop_->stride_[I];
497 if(first_[I] >= loop_->extent_[I])
498 {
499 first_[I] = loop_->thread_[I];
500 overflow = true;
501 }
502 index_[I] = first_[I];
503 range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
504 return overflow;
505 }
506
507 template<size_t N>
508 ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
509 {
510 if constexpr(N == 0)
511 {
512 // overflow
513 return true;
514 }
515 else
516 {
517 if(not nth_strided_loop<N - 1>())
518 {
519 return false;
520 }
521 else
522 {
523 return do_strided_loops<N - 1>();
524 }
525 }
526 ALPAKA_UNREACHABLE(false);
527 }
528
529 // increment the iterator
530 ALPAKA_FN_ACC inline constexpr void increment()
531 {
532 // linear N-dimensional loops over the elements associated to the thread;
533 // do_elements_loops<>() returns true if any of those loops overflows
534 if(not do_elements_loops<Dim::value>())
535 {
536 // the elements loops did not overflow, return the next index
537 return;
538 }
539
540 // strided N-dimensional loop over the threads in the kernel launch grid;
541 // do_strided_loops<>() returns true if any of those loops overflows
542 if(not do_strided_loops<Dim::value>())
543 {
544 // the strided loops did not overflow, return the next index
545 return;
546 }
547
548 // the iterator has reached or passed the end of the extent, clamp it to the extent
549 first_ = loop_->extent_;
550 range_ = loop_->extent_;
551 index_ = loop_->extent_;
552 }
553
554 // const pointer to the UniformElementsND that the iterator refers to
555 UniformElementsND const* loop_;
556
557 // modified by the pre/post-increment operator
558 Vec first_; // first element processed by this thread
559 Vec range_; // last element processed by this thread
560 Vec index_; // current element processed by this thread
561 };
562
563 private:
564 Vec const elements_;
565 Vec const thread_;
566 Vec const stride_;
567 Vec const extent_;
568 };
569
570 } // namespace detail
571
572 /* uniformElementsND
573 *
574 * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
575 */
576
577 template<concepts::Acc TAcc>
578 requires(alpaka::Dim<TAcc>::value > 0)
579 ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
580 {
581 return detail::UniformElementsND<TAcc>(acc);
582 }
583
584 template<concepts::Acc TAcc>
585 requires(alpaka::Dim<TAcc>::value > 0)
587 TAcc const& acc,
589 {
590 return detail::UniformElementsND<TAcc>(acc, extent);
591 }
592
593 namespace detail
594 {
595
596 /* UniformGroupsAlong
597 *
598 * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iteratable range than spans the group
599 * indices required to cover the given problem size along the `Dim` dimension, in units of the block size.
600 * `elements` indicates the total number of elements, across all groups; if not specified, it defaults to the
601 * kernel grid size along the `Dim` dimension.
602 *
603 * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
604 * infer the accelerator type from the argument.
605 *
606 * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<Tacc, 0>(acc,
607 * ...)`.
608 *
609 * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
610 * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
611 * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
612 * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
613 *
614 * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
615 * threads in a block see the same loop iterations, while threads in different blocks may see a different
616 * number of iterations. If the work division has more blocks than the required number of groups, the first
617 * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
618 * work division has less blocks than the required number of groups, some of the blocks will perform more than
619 * one iteration, in order to cover the whole problem space.
620 *
621 * If the problem size is not a multiple of the block size, the last group will process a number of elements
622 * smaller than the block size. However, also in this case all threads in the block will execute the same
623 * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
624 * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
625 * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
626 *
627 * For example, if the block size is 64 and there are 400 elements
628 *
629 * for (auto group: uniformGroupsAlong<Dim>(acc, 400)
630 *
631 * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
632 * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
633 * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
634 * last group; it is up to the inner loop to not process the non-existing elements after 399.
635 *
636 * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
637 * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
638 * 0 to 6 will process one group while block 7 will not process any.
639 *
640 * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
641 * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0
642 * will process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6,
643 * and block 3 will process group 3.
644 *
645 * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
646 * `uniformGroupElementsAlong<Dim>`.
647 */
648
649 template<concepts::Acc TAcc, std::size_t Dim>
650 requires(alpaka::Dim<TAcc>::value >= Dim)
651 class UniformGroupsAlong
652 {
653 public:
654 using Idx = alpaka::Idx<TAcc>;
655
656 ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
657 : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
658 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
659 , extent_{stride_}
660 {
661 }
662
663 // extent is the total number of elements (not blocks)
664 ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
665 : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
666 , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
667 , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
668 {
669 }
670
671 class const_iterator;
672 using iterator = const_iterator;
673
674 ALPAKA_FN_ACC inline const_iterator begin() const
675 {
676 return const_iterator(stride_, extent_, first_);
677 }
678
679 ALPAKA_FN_ACC inline const_iterator end() const
680 {
681 return const_iterator(stride_, extent_, extent_);
682 }
683
685 {
686 friend class UniformGroupsAlong;
687
688 ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
689 : stride_{stride}
690 , extent_{extent}
691 , first_{std::min(first, extent)}
692 {
693 }
694
695 public:
696 ALPAKA_FN_ACC inline Idx operator*() const
697 {
698 return first_;
699 }
700
701 // pre-increment the iterator
703 {
704 // increment the first-element-in-block index by the grid stride
705 first_ += stride_;
706 if(first_ < extent_)
707 return *this;
708
709 // the iterator has reached or passed the end of the extent, clamp it to the extent
710 first_ = extent_;
711 return *this;
712 }
713
714 // post-increment the iterator
716 {
717 const_iterator old = *this;
718 ++(*this);
719 return old;
720 }
721
722 ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
723 {
724 return (first_ == other.first_);
725 }
726
727 ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
728 {
729 return not(*this == other);
730 }
731
732 private:
733 // non-const to support iterator copy and assignment
734 Idx stride_;
735 Idx extent_;
736 // modified by the pre/post-increment operator
737 Idx first_;
738 };
739
740 private:
741 Idx const first_;
742 Idx const stride_;
743 Idx const extent_;
744 };
745
746 } // namespace detail
747
748 /* uniformGroups
749 *
750 * `uniformGroups(acc, elements)` returns a one-dimensional iteratable range than spans the group indices required
751 * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
752 * across all groups; if not specified, it defaults to the kernel grid size.
753 *
754 * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
755 *
756 * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
757 * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
758 * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
759 * the loop, while the other blocks will exit the loop immediately. If the work division has less blocks than the
760 * required number of groups, some of the blocks will perform more than one iteration, in order to cover the whole
761 * problem space.
762 *
763 * If the problem size is not a multiple of the block size, the last group will process a number of elements
764 * smaller than the block size. However, also in this case all threads in the block will execute the same number of
765 * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
766 * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
767 * implemented by `uniformGroupElements(acc, group, elements)`.
768 *
769 * For example, if the block size is 64 and there are 400 elements
770 *
771 * for (auto group: uniformGroups(acc, 400)
772 *
773 * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
774 * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
775 * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
776 * it is up to the inner loop to not process the non-existing elements after 399.
777 *
778 * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
779 * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
780 * will process one group while block 7 will not process any.
781 *
782 * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
783 * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
784 * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block
785 * 3 will process group 3.
786 *
787 * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
788 *
789 * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
790 * use
791 * - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
792 * - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
793 * along the fastest, second-fastest, or third-fastest dimension.
794 */
795
796 template<concepts::Acc TAcc, typename... TArgs>
797 requires(alpaka::Dim<TAcc>::value == 1)
798 ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
799 {
800 using Idx = alpaka::Idx<TAcc>;
801 return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
802 }
803
804 /* uniformGroupsAlong<Dim>
805 *
806 * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
807 * can infer the accelerator type from the argument.
808 */
809
810 template<std::size_t Dim, concepts::Acc TAcc, typename... TArgs>
811 requires(alpaka::Dim<TAcc>::value >= Dim)
812 ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
813 {
814 using Idx = alpaka::Idx<TAcc>;
815 return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
816 }
817
818 /* uniformGroupsAlongX, Y, Z
819 *
820 * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
821 * dimensions.
822 */
823
824 template<concepts::Acc TAcc, typename... TArgs>
825 requires(alpaka::Dim<TAcc>::value > 0)
826 ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
827 {
828 using Idx = alpaka::Idx<TAcc>;
829 return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
830 }
831
832 template<concepts::Acc TAcc, typename... TArgs>
833 requires(alpaka::Dim<TAcc>::value > 1)
834 ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
835 {
836 using Idx = alpaka::Idx<TAcc>;
837 return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
838 }
839
840 template<concepts::Acc TAcc, typename... TArgs>
841 requires(alpaka::Dim<TAcc>::value > 2)
842 ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
843 {
844 using Idx = alpaka::Idx<TAcc>;
845 return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
846 }
847
848 namespace detail
849 {
850
851 /* UniformGroupElementsAlong
852 *
853 * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iteratable range that
854 * spans all the elements within the given `group` along dimension `Dim`, as obtained from
855 * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
856 * across all groups; if not specified, it defaults to the kernel grid size.
857 *
858 * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
859 * ...)` that can infer the accelerator type from the argument.
860 *
861 * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
862 * `UniformGroupElementsAlong<0>(acc, ...)`.
863 *
864 * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
865 * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
866 * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
867 * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
868 *
869 * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local`
870 * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
871 * (excluded), while the local index spans the range from 0 to the block size (excluded).
872 *
873 * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
874 * the global element index reaches `elements`.
875 *
876 * If the problem size is not a multiple of the block size, different threads may execute a different number of
877 * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
878 * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
879 * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
880 * `uniformGroupElementsAlong<Dim>`.
881 *
882 * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
883 * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
884 * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
885 * check the element index explicitly inside the loop:
886 *
887 * for (auto element : uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements,
888 * alpaka::warp::getSize(acc)))) { bool flag = false; if (element < elements) {
889 * // do some work and compute a result flag only for the valid elements
890 * flag = do_some_work();
891 * }
892 * // check if any valid element had a positive result
893 * if (alpaka::warp::any(acc, flag)) {
894 * // ...
895 * }
896 * }
897 *
898 * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
899 * `N-1`.
900 */
901
902 template<concepts::Acc TAcc, std::size_t Dim>
903 requires(alpaka::Dim<TAcc>::value >= Dim)
904 class UniformGroupElementsAlong
905 {
906 public:
907 using Idx = alpaka::Idx<TAcc>;
908
909 ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
910 : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
911 , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
912 , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
913 {
914 }
915
916 ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
917 : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
918 , local_{std::min(
919 extent - first_,
920 alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
921 * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
922 , range_{
923 std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
924 {
925 }
926
927 class const_iterator;
928 using iterator = const_iterator;
929
930 ALPAKA_FN_ACC inline const_iterator begin() const
931 {
932 return const_iterator(local_, first_, range_);
933 }
934
935 ALPAKA_FN_ACC inline const_iterator end() const
936 {
937 return const_iterator(range_, first_, range_);
938 }
939
941 {
942 friend class UniformGroupElementsAlong;
943
944 ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
945 : index_{local}
946 , first_{first}
947 , range_{range}
948 {
949 }
950
951 public:
953 {
954 return ElementIndex<Idx>{index_ + first_, index_};
955 }
956
957 // pre-increment the iterator
959 {
960 // increment the index along the elements processed by the current thread
961 ++index_;
962 if(index_ < range_)
963 return *this;
964
965 // the iterator has reached or passed the end of the extent, clamp it to the extent
966 index_ = range_;
967 return *this;
968 }
969
970 // post-increment the iterator
972 {
973 const_iterator old = *this;
974 ++(*this);
975 return old;
976 }
977
978 ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
979 {
980 return (index_ == other.index_);
981 }
982
983 ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
984 {
985 return not(*this == other);
986 }
987
988 private:
989 // modified by the pre/post-increment operator
990 Idx index_;
991 // non-const to support iterator copy and assignment
992 Idx first_;
993 Idx range_;
994 };
995
996 private:
997 Idx const first_;
998 Idx const local_;
999 Idx const range_;
1000 };
1001
1002 } // namespace detail
1003
1004 /* uniformGroupElements
1005 *
1006 * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
1007 * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
1008 * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
1009 *
1010 * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
1011 *
1012 * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices
1013 * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
1014 * while the local index spans the range from 0 to the block size (excluded).
1015 *
1016 * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
1017 * global element index reaches `elements`.
1018 *
1019 * If the problem size is not a multiple of the block size, different threads may execute a different number of
1020 * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
1021 * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
1022 * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
1023 *
1024 * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
1025 * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
1026 * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
1027 * element index explicitly inside the loop:
1028 *
1029 * for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
1030 * bool flag = false;
1031 * if (element < elements) {
1032 * // do some work and compute a result flag only for the valid elements
1033 * flag = do_some_work();
1034 * }
1035 * // check if any valid element had a positive result
1036 * if (alpaka::warp::any(acc, flag)) {
1037 * // ...
1038 * }
1039 * }
1040 *
1041 * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
1042 * kernels, use
1043 * - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension
1044 * `Dim`;
1045 * - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
1046 * `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
1047 * dimension.
1048 */
1049
1050 template<concepts::Acc TAcc, typename... TArgs>
1051 requires(alpaka::Dim<TAcc>::value == 1)
1052 ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
1053 {
1054 using Idx = alpaka::Idx<TAcc>;
1055 return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
1056 }
1057
1058 /* uniformGroupElementsAlong<Dim>
1059 *
1060 * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
1061 * Dim>(acc, ...)` that can infer the accelerator type from the argument.
1062 */
1063
1064 template<std::size_t Dim, concepts::Acc TAcc, typename... TArgs>
1065 requires(alpaka::Dim<TAcc>::value >= Dim)
1066 ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
1067 {
1068 using Idx = alpaka::Idx<TAcc>;
1069 return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
1070 }
1071
1072 /* uniformGroupElementsAlongX, Y, Z
1073 *
1074 * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
1075 * dimensions.
1076 */
1077
1078 template<concepts::Acc TAcc, typename... TArgs>
1079 requires(alpaka::Dim<TAcc>::value > 0)
1080 ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
1081 {
1082 using Idx = alpaka::Idx<TAcc>;
1083 return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
1084 }
1085
1086 template<concepts::Acc TAcc, typename... TArgs>
1087 requires(alpaka::Dim<TAcc>::value > 1)
1088 ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
1089 {
1090 using Idx = alpaka::Idx<TAcc>;
1091 return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
1092 }
1093
1094 template<concepts::Acc TAcc, typename... TArgs>
1095 requires(alpaka::Dim<TAcc>::value > 2)
1096 ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
1097 {
1098 using Idx = alpaka::Idx<TAcc>;
1099 return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
1100 }
1101
1102} // namespace alpaka
#define ALPAKA_UNREACHABLE(...)
Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches....
A n-dimensional vector.
Definition Vec.hpp:38
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
ALPAKA_FN_ACC UniformElementsAlong(TAcc const &acc, Idx first, Idx extent)
ALPAKA_FN_ACC const_iterator begin() const
ALPAKA_FN_ACC const_iterator end() const
ALPAKA_FN_ACC UniformElementsAlong(TAcc const &acc, Idx extent)
ALPAKA_FN_ACC UniformElementsAlong(TAcc const &acc)
ALPAKA_FN_ACC constexpr const_iterator operator++(int)
ALPAKA_FN_ACC constexpr bool operator!=(const_iterator const &other) const
ALPAKA_FN_ACC constexpr const_iterator operator++()
ALPAKA_FN_ACC constexpr bool operator==(const_iterator const &other) const
ALPAKA_FN_ACC ElementIndex< Idx > operator*() const
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition Common.hpp:38
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition Utility.hpp:27
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto min(T const &min_ctx, Tx const &x, Ty const &y)
Returns the smaller of two arguments. NaNs are treated as missing data (between a NaN and a numeric v...
Definition Traits.hpp:1280
ALPAKA_FN_HOST auto end(TView &view) -> Iterator< TView >
Definition Iterator.hpp:133
ALPAKA_FN_HOST auto begin(TView &view) -> Iterator< TView >
Definition Iterator.hpp:127
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition Traits.hpp:29
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getWorkDiv(TWorkDiv const &workDiv) -> Vec< Dim< TWorkDiv >, Idx< TWorkDiv > >
Get the extent requested.
Definition Traits.hpp:33
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getIdx(TIdx const &idx, TWorkDiv const &workDiv) -> Vec< Dim< TWorkDiv >, Idx< TIdx > >
Get the indices requested.
Definition Accessors.hpp:23
ALPAKA_FN_ACC auto uniformGroupsAlong(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroupElementsAlong(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformElementsAlongZ(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformElements(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroupsAlongX(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroupElementsAlongX(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformElementsND(TAcc const &acc)
ALPAKA_FN_ACC auto uniformElementsAlongX(TAcc const &acc, TArgs... args)
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec< TDim, TVal > const &p, Vecs const &... qs) -> Vec< TDim, TVal >
Definition Vec.hpp:554
ALPAKA_FN_ACC auto uniformGroupElementsAlongZ(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroupElementsAlongY(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroupElements(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroupsAlongY(TAcc const &acc, TArgs... args)
ALPAKA_FN_HOST_ACC Vec(TFirstIndex &&, TRestIndices &&...) -> Vec< DimInt< 1+sizeof...(TRestIndices)>, std::decay_t< TFirstIndex > >
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_FN_ACC auto uniformGroupsAlongZ(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformElementsAlongY(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformGroups(TAcc const &acc, TArgs... args)
ALPAKA_FN_ACC auto uniformElementsAlong(TAcc const &acc, TArgs... args)
STL namespace.