alpaka
Abstraction Library for Parallel Kernel Acceleration
UniformElements.hpp
#pragma once

#include "alpaka/acc/Traits.hpp"
#include "alpaka/core/Utility.hpp"
#include "alpaka/exec/ElementIndex.hpp"
#include "alpaka/vec/Vec.hpp"

#include <algorithm>
#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
#include <cstddef>
#include <type_traits>

namespace alpaka
{

    namespace detail
    {

        /* UniformElementsAlong
         *
         * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that
         * spans the element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension. If
         * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
         * size along the `Dim` dimension.
         *
         * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
         * can infer the accelerator type from the argument.
         *
         * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
         * 0>(acc, ...)`.
         *
         * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
         * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
         * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
         *
         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
         * loop over each group's elements, and synchronise only in the outer loop:
         *
         *    for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
         *        for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
         *            // first part of the computation
         *            // no synchronisations here
         *            ...
         *        }
         *        // wait for all threads to complete the first part
         *        alpaka::syncBlockThreads(acc);
         *        for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
         *            // second part of the computation
         *            // no synchronisations here
         *            ...
         *        }
         *        // wait for all threads to complete the second part
         *        alpaka::syncBlockThreads(acc);
         *        ...
         *    }
         *
         * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
         * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
         * check the element index explicitly inside the loop:
         *
         *    for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
         *        bool flag = false;
         *        if (element < extent) {
         *            // do some work and compute a result flag only for the valid elements
         *            flag = do_some_work();
         *        }
         *        // check if any valid element had a positive result
         *        if (alpaka::warp::any(acc, flag)) {
         *            // ...
         *        }
         *    }
         *
         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
         * `N-1`.
         */

        template<
            typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
        class UniformElementsAlong
        {
        public:
            using Idx = alpaka::Idx<TAcc>;

            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , extent_{stride_}
            {
            }

            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , extent_{extent}
            {
            }

            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , extent_{extent}
            {
            }

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                return const_iterator(elements_, stride_, extent_, first_);
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                return const_iterator(elements_, stride_, extent_, extent_);
            }

            class const_iterator
            {
                friend class UniformElementsAlong;

                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
                    : elements_{elements}
                    // we need to reduce the stride by one element range because index_ is later increased with
                    // each increment
                    , stride_{stride - elements}
                    , extent_{extent}
                    , index_{std::min(first, extent)}
                {
                }

            public:
                ALPAKA_FN_ACC inline Idx operator*() const
                {
                    return index_;
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline const_iterator& operator++()
                {
                    // increment the index along the elements processed by the current thread
                    ++indexElem_;
                    ++index_;
                    if(indexElem_ >= elements_)
                    {
                        indexElem_ = 0;
                        index_ += stride_;
                    }
                    if(index_ >= extent_)
                        index_ = extent_;

                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    ++(*this);
                    return old;
                }

                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
                {
                    return (*(*this) == *other);
                }

                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // non-const to support iterator copy and assignment
                Idx elements_;
                Idx stride_;
                Idx extent_;
                // modified by the pre/post-increment operator
                Idx index_;
                Idx indexElem_ = 0;
            };

        private:
            Idx const elements_;
            Idx const first_;
            Idx const stride_;
            Idx const extent_;
        };

    } // namespace detail

    /* uniformElements
     *
     * `uniformElements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element
     * indices from `first` (inclusive) to `extent` (exclusive). If `first` is not specified, it defaults to 0. If
     * `extent` is not specified, it defaults to the kernel grid size.
     *
     * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
     *
     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
     * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
     * over each group's elements, and synchronise only in the outer loop:
     *
     *    for (auto group : uniformGroups(acc, extent)) {
     *        for (auto element : uniformGroupElements(acc, group, extent)) {
     *            // first part of the computation
     *            // no synchronisations here
     *            ...
     *        }
     *        // wait for all threads to complete the first part
     *        alpaka::syncBlockThreads(acc);
     *        for (auto element : uniformGroupElements(acc, group, extent)) {
     *            // second part of the computation
     *            // no synchronisations here
     *            ...
     *        }
     *        // wait for all threads to complete the second part
     *        alpaka::syncBlockThreads(acc);
     *        ...
     *    }
     *
     * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
     * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
     * element index explicitly inside the loop:
     *
     *    for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
     *        bool flag = false;
     *        if (element < extent) {
     *            // do some work and compute a result flag only for elements up to extent
     *            flag = do_some_work();
     *        }
     *        // check if any valid element had a positive result
     *        if (alpaka::warp::any(acc, flag)) {
     *            // ...
     *        }
     *    }
     *
     * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
     * use
     *  - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
     *  - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
     *  - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
     *    to loop along the fastest, second-fastest, or third-fastest dimension.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
    }
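
    /* Example
     *
     * A minimal sketch of a 1-dimensional kernel that uses `uniformElements` to scale the first `size` values of an
     * array. The kernel name `ScaleKernel`, its arguments and the scale factor are illustrative assumptions, not
     * part of the alpaka API:
     *
     *    struct ScaleKernel {
     *        template <typename TAcc>
     *        ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, int32_t size) const {
     *            // each thread processes its own elements, with a grid-size stride between iterations
     *            for (auto index : alpaka::uniformElements(acc, size)) {
     *                data[index] *= 2.f;
     *            }
     *        }
     *    };
     */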

    /* uniformElementsAlong<Dim>
     *
     * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
     * that can infer the accelerator type from the argument.
     */

    template<
        std::size_t Dim,
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
    }

    /* uniformElementsAlongX, Y, Z
     *
     * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
     * dimensions.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
    ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
    ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
    }
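
    /* Example
     *
     * A sketch of a 2-dimensional kernel that adds two row-major matrices, using `uniformElementsAlongY` for the
     * slower (row) dimension and `uniformElementsAlongX` for the faster (column) dimension. The kernel name
     * `MatrixAddKernel` and its arguments are illustrative assumptions:
     *
     *    struct MatrixAddKernel {
     *        template <typename TAcc>
     *        ALPAKA_FN_ACC void operator()(
     *            TAcc const& acc, float const* a, float const* b, float* sum, int32_t rows, int32_t cols) const {
     *            // in a 2-dimensional kernel, "Y" maps to dimension 0 and "X" to dimension 1
     *            for (auto row : alpaka::uniformElementsAlongY(acc, rows)) {
     *                for (auto col : alpaka::uniformElementsAlongX(acc, cols)) {
     *                    sum[row * cols + col] = a[row * cols + col] + b[row * cols + col];
     *                }
     *            }
     *        }
     *    };
     */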

    namespace detail
    {

        /* UniformElementsND
         *
         * `UniformElementsND(acc, extent)` returns an N-dimensional iterable range that spans the element indices
         * required to cover the given problem size, indicated by `extent`.
         *
         * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
         *
         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
         * loop over each group's elements, and synchronise only in the outer loop:
         *
         *    for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
         *        for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
         *            for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
         *                for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
         *                    // first part of the computation
         *                    // no synchronisations here
         *                    ...
         *                }
         *            }
         *            // wait for all threads to complete the first part
         *            alpaka::syncBlockThreads(acc);
         *            for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
         *                for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
         *                    // second part of the computation
         *                    // no synchronisations here
         *                    ...
         *                }
         *            }
         *            // wait for all threads to complete the second part
         *            alpaka::syncBlockThreads(acc);
         *            ...
         *        }
         *    }
         *
         * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
         */

        template<
            typename TAcc,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
        class UniformElementsND
        {
        public:
            using Dim = alpaka::Dim<TAcc>;
            using Idx = alpaka::Idx<TAcc>;
            using Vec = alpaka::Vec<Dim, Idx>;

            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , extent_{stride_}
            {
            }

            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , extent_{extent}
            {
            }

            // tag used to construct an end iterator
            struct at_end_t
            {
            };

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                // check that all dimensions of the current thread index are within the extent
                if((thread_ < extent_).all())
                {
                    // construct an iterator pointing to the first element to be processed by the current thread
                    return const_iterator{this, thread_};
                }
                else
                {
                    // construct an end iterator, pointing past the end of the extent
                    return const_iterator{this, at_end_t{}};
                }
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                // construct an end iterator, pointing past the end of the extent
                return const_iterator{this, at_end_t{}};
            }

            class const_iterator
            {
                friend class UniformElementsND;

            public:
                ALPAKA_FN_ACC inline Vec operator*() const
                {
                    return index_;
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline constexpr const_iterator operator++()
                {
                    increment();
                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline constexpr const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    increment();
                    return old;
                }

                ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
                {
                    return (index_ == other.index_);
                }

                ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // construct an iterator pointing to the first element to be processed by the current thread
                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
                    : loop_{loop}
                    , first_{alpaka::elementwise_min(first, loop->extent_)}
                    , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
                    , index_{first_}
                {
                }

                // construct an end iterator, pointing past the end of the extent
                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
                    : loop_{loop}
                    , first_{loop_->extent_}
                    , range_{loop_->extent_}
                    , index_{loop_->extent_}
                {
                }

                template<size_t I>
                ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
                {
                    bool overflow = false;
                    ++index_[I];
                    if(index_[I] >= range_[I])
                    {
                        index_[I] = first_[I];
                        overflow = true;
                    }
                    return overflow;
                }

                template<size_t N>
                ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
                {
                    if constexpr(N == 0)
                    {
                        // overflow
                        return true;
                    }
                    else
                    {
                        if(not nth_elements_loop<N - 1>())
                        {
                            return false;
                        }
                        else
                        {
                            return do_elements_loops<N - 1>();
                        }
                    }
                    ALPAKA_UNREACHABLE(false);
                }

                template<size_t I>
                ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
                {
                    bool overflow = false;
                    first_[I] += loop_->stride_[I];
                    if(first_[I] >= loop_->extent_[I])
                    {
                        first_[I] = loop_->thread_[I];
                        overflow = true;
                    }
                    index_[I] = first_[I];
                    range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
                    return overflow;
                }

                template<size_t N>
                ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
                {
                    if constexpr(N == 0)
                    {
                        // overflow
                        return true;
                    }
                    else
                    {
                        if(not nth_strided_loop<N - 1>())
                        {
                            return false;
                        }
                        else
                        {
                            return do_strided_loops<N - 1>();
                        }
                    }
                    ALPAKA_UNREACHABLE(false);
                }

                // increment the iterator
                ALPAKA_FN_ACC inline constexpr void increment()
                {
                    // linear N-dimensional loops over the elements associated to the thread;
                    // do_elements_loops<>() returns true if any of those loops overflows
                    if(not do_elements_loops<Dim::value>())
                    {
                        // the elements loops did not overflow, return the next index
                        return;
                    }

                    // strided N-dimensional loop over the threads in the kernel launch grid;
                    // do_strided_loops<>() returns true if any of those loops overflows
                    if(not do_strided_loops<Dim::value>())
                    {
                        // the strided loops did not overflow, return the next index
                        return;
                    }

                    // the iterator has reached or passed the end of the extent, clamp it to the extent
                    first_ = loop_->extent_;
                    range_ = loop_->extent_;
                    index_ = loop_->extent_;
                }

                // const pointer to the UniformElementsND that the iterator refers to
                UniformElementsND const* loop_;

                // modified by the pre/post-increment operator
                Vec first_; // first element processed by this thread
                Vec range_; // end of the range of elements processed by this thread
                Vec index_; // current element processed by this thread
            };

        private:
            Vec const elements_;
            Vec const thread_;
            Vec const stride_;
            Vec const extent_;
        };

    } // namespace detail

    /* uniformElementsND
     *
     * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
     */

    template<
        typename TAcc,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
    {
        return detail::UniformElementsND<TAcc>(acc);
    }

    template<
        typename TAcc,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformElementsND(
        TAcc const& acc,
        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> extent)
    {
        return detail::UniformElementsND<TAcc>(acc, extent);
    }
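
    /* Example
     *
     * A sketch of covering a 2-dimensional problem space with a single loop, using `uniformElementsND`. The names
     * `in`, `out`, `rows` and `cols` are illustrative assumptions; `ndindex` is an `alpaka::Vec` whose component 0
     * is the slower (row) index and component 1 the faster (column) index:
     *
     *    using Vec2D = alpaka::Vec<alpaka::DimInt<2u>, Idx>;
     *    for (auto ndindex : alpaka::uniformElementsND(acc, Vec2D{rows, cols})) {
     *        out[ndindex[0] * cols + ndindex[1]] = in[ndindex[0] * cols + ndindex[1]];
     *    }
     */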

    namespace detail
    {

        /* UniformGroupsAlong
         *
         * `UniformGroupsAlong<TAcc, Dim>(acc, elements)` returns a one-dimensional iterable range that spans the
         * group indices required to cover the given problem size along the `Dim` dimension, in units of the block
         * size. `elements` indicates the total number of elements, across all groups; if not specified, it defaults
         * to the kernel grid size along the `Dim` dimension.
         *
         * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
         * infer the accelerator type from the argument.
         *
         * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc,
         * ...)`.
         *
         * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
         * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
         * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
         *
         * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
         * threads in a block see the same loop iterations, while threads in different blocks may see a different
         * number of iterations. If the work division has more blocks than the required number of groups, the first
         * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
         * work division has fewer blocks than the required number of groups, some of the blocks will perform more than
         * one iteration, in order to cover the whole problem space.
         *
         * If the problem size is not a multiple of the block size, the last group will process a number of elements
         * smaller than the block size. However, also in this case all threads in the block will execute the same
         * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
         * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
         * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
         *
         * For example, if the block size is 64 and there are 400 elements
         *
         *    for (auto group : uniformGroupsAlong<Dim>(acc, 400))
         *
         * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
         * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
         * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
         * last group; it is up to the inner loop to not process the non-existing elements after 399.
         *
         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
         * 0 to 6 will process one group while block 7 will not process any.
         *
         * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of
         * the loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0
         * will process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6,
         * and block 3 will process group 3.
         *
         * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
         * `uniformGroupElementsAlong<Dim>`.
         */

        template<
            typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
        class UniformGroupsAlong
        {
        public:
            using Idx = alpaka::Idx<TAcc>;

            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , extent_{stride_}
            {
            }

            // extent is the total number of elements (not blocks)
            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
            {
            }

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                return const_iterator(stride_, extent_, first_);
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                return const_iterator(stride_, extent_, extent_);
            }

            class const_iterator
            {
                friend class UniformGroupsAlong;

                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
                    : stride_{stride}
                    , extent_{extent}
                    , first_{std::min(first, extent)}
                {
                }

            public:
                ALPAKA_FN_ACC inline Idx operator*() const
                {
                    return first_;
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline const_iterator& operator++()
                {
                    // increment the first-element-in-block index by the grid stride
                    first_ += stride_;
                    if(first_ < extent_)
                        return *this;

                    // the iterator has reached or passed the end of the extent, clamp it to the extent
                    first_ = extent_;
                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    ++(*this);
                    return old;
                }

                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
                {
                    return (first_ == other.first_);
                }

                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // non-const to support iterator copy and assignment
                Idx stride_;
                Idx extent_;
                // modified by the pre/post-increment operator
                Idx first_;
            };

        private:
            Idx const first_;
            Idx const stride_;
            Idx const extent_;
        };

    } // namespace detail

    /* uniformGroups
     *
     * `uniformGroups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required
     * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
     * across all groups; if not specified, it defaults to the kernel grid size.
     *
     * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
     *
     * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
     * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
     * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
     * the loop, while the other blocks will exit the loop immediately. If the work division has fewer blocks than the
     * required number of groups, some of the blocks will perform more than one iteration, in order to cover the whole
     * problem space.
     *
     * If the problem size is not a multiple of the block size, the last group will process a number of elements
     * smaller than the block size. However, also in this case all threads in the block will execute the same number of
     * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
     * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
     * implemented by `uniformGroupElements(acc, group, elements)`.
     *
     * For example, if the block size is 64 and there are 400 elements
     *
     *    for (auto group : uniformGroups(acc, 400))
     *
     * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
     * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
     * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
     * it is up to the inner loop to not process the non-existing elements after 399.
     *
     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
     * will process one group while block 7 will not process any.
     *
     * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
     * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
     * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block
     * 3 will process group 3.
     *
     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
     *
     * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
     * use
     *  - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
     *  - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
     *    along the fastest, second-fastest, or third-fastest dimension.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
    }
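
    /* Example
     *
     * A sketch of the two-phase pattern described above, with a block-level synchronisation between the phases. All
     * threads in a block run the same number of iterations of the outer loop, so `alpaka::syncBlockThreads(acc)` is
     * safe there. The names `data`, `size`, `first_phase` and `second_phase` are illustrative assumptions:
     *
     *    for (auto group : alpaka::uniformGroups(acc, size)) {
     *        for (auto element : alpaka::uniformGroupElements(acc, group, size)) {
     *            data[element.global] = first_phase(element.global);
     *        }
     *        // wait for all threads in the block to complete the first phase
     *        alpaka::syncBlockThreads(acc);
     *        for (auto element : alpaka::uniformGroupElements(acc, group, size)) {
     *            data[element.global] = second_phase(data[element.global]);
     *        }
     *    }
     */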

    /* uniformGroupsAlong<Dim>
     *
     * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
     * can infer the accelerator type from the argument.
     */

    template<
        std::size_t Dim,
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
    }

    /* uniformGroupsAlongX, Y, Z
     *
     * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
     * dimensions.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
    }

    namespace detail
    {

        /* UniformGroupElementsAlong
         *
         * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iterable range that
         * spans all the elements within the given `group` along dimension `Dim`, as obtained from
         * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
         * across all groups; if not specified, it defaults to the kernel grid size.
         *
         * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
         * ...)` that can infer the accelerator type from the argument.
         *
         * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
         * `UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
         *
         * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
         * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
         * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
         *
         * Iterating over the range yields values of type `ElementIndex`, which provide the `.global` and `.local`
         * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
         * (excluded), while the local index spans the range from 0 to the block size (excluded).
         *
         * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
         * the global element index reaches `elements`.
         *
         * If the problem size is not a multiple of the block size, different threads may execute a different number of
         * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
         * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
         * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
         * `uniformGroupElementsAlong<Dim>`.
         *
         * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
         * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
         * check the element index explicitly inside the loop:
         *
         *    for (auto element :
         *         uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
         *        bool flag = false;
         *        if (element.global < elements) {
         *            // do some work and compute a result flag only for the valid elements
         *            flag = do_some_work();
         *        }
         *        // check if any valid element had a positive result
         *        if (alpaka::warp::any(acc, flag)) {
         *            // ...
         *        }
         *    }
         *
         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
         * `N-1`.
         */

        template<
            typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
        class UniformGroupElementsAlong
        {
        public:
            using Idx = alpaka::Idx<TAcc>;

            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
                , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
                         * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
            {
            }

            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
                , local_{std::min(
                      extent - first_,
                      alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
                          * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
                , range_{
                      std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
            {
            }

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                return const_iterator(local_, first_, range_);
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                return const_iterator(range_, first_, range_);
            }

            class const_iterator
            {
                friend class UniformGroupElementsAlong;

                ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
                    : index_{local}
                    , first_{first}
                    , range_{range}
                {
                }

            public:
                ALPAKA_FN_ACC inline ElementIndex<Idx> operator*() const
                {
                    return ElementIndex<Idx>{index_ + first_, index_};
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline const_iterator& operator++()
                {
                    // increment the index along the elements processed by the current thread
                    ++index_;
                    if(index_ < range_)
                        return *this;

                    // the iterator has reached or passed the end of the extent, clamp it to the extent
                    index_ = range_;
                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    ++(*this);
                    return old;
                }

                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
                {
                    return (index_ == other.index_);
                }

                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // modified by the pre/post-increment operator
                Idx index_;
                // non-const to support iterator copy and assignment
                Idx first_;
                Idx range_;
            };

        private:
            Idx const first_;
            Idx const local_;
            Idx const range_;
        };

    } // namespace detail

    /* uniformGroupElements
     *
     * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
     * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
     * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
     *
     * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
     *
     * Iterating over the range yields values of type `ElementIndex`, which provide the `.global` and `.local` indices
     * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
     * while the local index spans the range from 0 to the block size (excluded).
     *
     * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
     * global element index reaches `elements`.
     *
     * If the problem size is not a multiple of the block size, different threads may execute a different number of
     * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
     * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
     *
     * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
     * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
     * element index explicitly inside the loop:
     *
     *    for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
     *        bool flag = false;
     *        if (element.global < elements) {
     *            // do some work and compute a result flag only for the valid elements
     *            flag = do_some_work();
     *        }
     *        // check if any valid element had a positive result
     *        if (alpaka::warp::any(acc, flag)) {
     *            // ...
     *        }
     *    }
     *
     * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
     * kernels, use
     *  - `uniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
     *  - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
     *    `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
     *    dimension.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
    }
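
    /* Example
     *
     * A sketch that uses the `.local` and `.global` indices of an `ElementIndex` to stage data through block shared
     * memory: each thread copies its elements into a shared buffer, the block synchronises, and the buffer is read
     * back in reverse local order. The buffer size of 64, the names `in`, `out`, `blockSize` and `size`, and the
     * simplifying assumption that `size` is a multiple of `blockSize` are all illustrative, not part of this API:
     *
     *    auto& buffer = alpaka::declareSharedVar<float[64], __COUNTER__>(acc);
     *    for (auto group : alpaka::uniformGroups(acc, size)) {
     *        for (auto element : alpaka::uniformGroupElements(acc, group, size)) {
     *            buffer[element.local] = in[element.global];
     *        }
     *        // wait until the whole block has filled the shared buffer
     *        alpaka::syncBlockThreads(acc);
     *        for (auto element : alpaka::uniformGroupElements(acc, group, size)) {
     *            out[element.global] = buffer[blockSize - 1 - element.local];
     *        }
     *        // wait before the next iteration overwrites the shared buffer
     *        alpaka::syncBlockThreads(acc);
     *    }
     */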

    /* uniformGroupElementsAlong<Dim>
     *
     * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
     */

    template<
        std::size_t Dim,
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
    }

    /* uniformGroupElementsAlongX, Y, Z
     *
     * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
     * dimensions.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
    }

} // namespace alpaka