alpaka
Abstraction Library for Parallel Kernel Acceleration

UniformElements.hpp
#pragma once

#include <algorithm>
#include <cstddef>
#include <type_traits>

namespace alpaka
{

    namespace detail
    {

        /* UniformElementsAlong
         *
         * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that
         * spans the element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension. If
         * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
         * size along the `Dim` dimension.
         *
         * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
         * can infer the accelerator type from the argument.
         *
         * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
         * 0>(acc, ...)`.
         *
         * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
         * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
         * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
         *
         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
         * loop over each group's elements, and synchronise only in the outer loop:
         *
         *   for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
         *     for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
         *       // first part of the computation
         *       // no synchronisations here
         *       ...
         *     }
         *     // wait for all threads to complete the first part
         *     alpaka::syncBlockThreads();
         *     for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
         *       // second part of the computation
         *       // no synchronisations here
         *       ...
         *     }
         *     // wait for all threads to complete the second part
         *     alpaka::syncBlockThreads();
         *     ...
         *   }
         *
         * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
         * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
         * check the element index explicitly inside the loop:
         *
         *   for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
         *     bool flag = false;
         *     if (element < extent) {
         *       // do some work and compute a result flag only for the valid elements
         *       flag = do_some_work();
         *     }
         *     // check if any valid element had a positive result
         *     if (alpaka::warp::any(acc, flag)) {
         *       // ...
         *     }
         *   }
         *
         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
         * `N-1`.
         */

        template<
            typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
        class UniformElementsAlong
        {
        public:
            using Idx = alpaka::Idx<TAcc>;

            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , extent_{stride_}
            {
            }

            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , extent_{extent}
            {
            }

            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
                , extent_{extent}
            {
            }

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                return const_iterator(elements_, stride_, extent_, first_);
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                return const_iterator(elements_, stride_, extent_, extent_);
            }

            class const_iterator
            {
                friend class UniformElementsAlong;

                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
                    : elements_{elements}
                    ,
                    // we need to reduce the stride by one element range because index_ is later increased with each
                    // increment
                    stride_{stride - elements}
                    , extent_{extent}
                    , index_{std::min(first, extent)}
                {
                }

            public:
                ALPAKA_FN_ACC inline Idx operator*() const
                {
                    return index_;
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline const_iterator& operator++()
                {
                    // increment the index along the elements processed by the current thread
                    ++indexElem_;
                    ++index_;
                    if(indexElem_ >= elements_)
                    {
                        indexElem_ = 0;
                        index_ += stride_;
                    }
                    if(index_ >= extent_)
                        index_ = extent_;

                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    ++(*this);
                    return old;
                }

                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
                {
                    return (*(*this) == *other);
                }

                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // non-const to support iterator copy and assignment
                Idx elements_;
                Idx stride_;
                Idx extent_;
                // modified by the pre/post-increment operator
                Idx index_;
                Idx indexElem_ = 0;
            };

        private:
            Idx const elements_;
            Idx const first_;
            Idx const stride_;
            Idx const extent_;
        };

    } // namespace detail

    /* uniformElements
     *
     * `uniformElements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element
     * indices from `first` (inclusive) to `extent` (exclusive). If `first` is not specified, it defaults to 0. If
     * `extent` is not specified, it defaults to the kernel grid size.
     *
     * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
     *
     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
     * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
     * over each group's elements, and synchronise only in the outer loop:
     *
     *   for (auto group : uniformGroups(acc, extent)) {
     *     for (auto element : uniformGroupElements(acc, group, extent)) {
     *       // first part of the computation
     *       // no synchronisations here
     *       ...
     *     }
     *     // wait for all threads to complete the first part
     *     alpaka::syncBlockThreads();
     *     for (auto element : uniformGroupElements(acc, group, extent)) {
     *       // second part of the computation
     *       // no synchronisations here
     *       ...
     *     }
     *     // wait for all threads to complete the second part
     *     alpaka::syncBlockThreads();
     *     ...
     *   }
     *
     * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
     * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
     * element index explicitly inside the loop:
     *
     *   for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
     *     bool flag = false;
     *     if (element < extent) {
     *       // do some work and compute a result flag only for elements up to extent
     *       flag = do_some_work();
     *     }
     *     // check if any valid element had a positive result
     *     if (alpaka::warp::any(acc, flag)) {
     *       // ...
     *     }
     *   }
     *
     * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
     * use
     *   - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
     *   - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
     *   - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
     *     to loop along the fastest, second-fastest, or third-fastest dimension.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
    }
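
    /* Example (illustrative sketch, not part of this header): a minimal 1-dimensional kernel that uses
     * `uniformElements` to cover the whole problem space with any work division. The kernel name, the buffers and
     * the `scale` parameter are hypothetical placeholders.
     *
     *   struct ScaleKernel {
     *     template <typename TAcc>
     *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, float scale, int size) const {
     *       for (auto i : alpaka::uniformElements(acc, size)) {
     *         out[i] = in[i] * scale;
     *       }
     *     }
     *   };
     *
     * Each thread processes its own elements, and the underlying grid-strided loop covers problem sizes larger than
     * the total number of threads times the number of elements per thread.
     */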

    /* uniformElementsAlong<Dim>
     *
     * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
     * that can infer the accelerator type from the argument.
     */

    template<
        std::size_t Dim,
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
    }

    /* uniformElementsAlongX, Y, Z
     *
     * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
     * dimensions.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
    ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
    ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
    }
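
    /* Example (illustrative sketch, not part of this header): in a 2-dimensional kernel the X/Y shorthands can be
     * nested to reproduce the usual CUDA/HIP loop order, with the outer loop over the slower dimension and the
     * inner loop over the faster one. The buffers, `width` and `height` are hypothetical placeholders.
     *
     *   for (auto y : alpaka::uniformElementsAlongY(acc, height)) {
     *     for (auto x : alpaka::uniformElementsAlongX(acc, width)) {
     *       out[y * width + x] = 2.f * in[y * width + x];
     *     }
     *   }
     */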

    namespace detail
    {

        /* UniformElementsND
         *
         * `UniformElementsND(acc, extent)` returns an N-dimensional iterable range that spans the element indices
         * required to cover the given problem size, indicated by `extent`.
         *
         * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
         *
         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
         * loop over each group's elements, and synchronise only in the outer loop:
         *
         *   for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
         *     for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
         *       for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
         *         for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
         *           // first part of the computation
         *           // no synchronisations here
         *           ...
         *         }
         *       }
         *       // wait for all threads to complete the first part
         *       alpaka::syncBlockThreads();
         *       for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
         *         for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
         *           // second part of the computation
         *           // no synchronisations here
         *           ...
         *         }
         *       }
         *       // wait for all threads to complete the second part
         *       alpaka::syncBlockThreads();
         *       ...
         *     }
         *   }
         *
         * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
         */

        template<
            typename TAcc,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
        class UniformElementsND
        {
        public:
            using Dim = alpaka::Dim<TAcc>;
            using Idx = alpaka::Idx<TAcc>;
            using Vec = alpaka::Vec<Dim, Idx>;

            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , extent_{stride_}
            {
            }

            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
                , extent_{extent}
            {
            }

            // tag used to construct an end iterator
            struct at_end_t
            {
            };

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                // check that all dimensions of the current thread index are within the extent
                if((thread_ < extent_).all())
                {
                    // construct an iterator pointing to the first element to be processed by the current thread
                    return const_iterator{this, thread_};
                }
                else
                {
                    // construct an end iterator, pointing past the end of the extent
                    return const_iterator{this, at_end_t{}};
                }
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                // construct an end iterator, pointing past the end of the extent
                return const_iterator{this, at_end_t{}};
            }

            class const_iterator
            {
                friend class UniformElementsND;

            public:
                ALPAKA_FN_ACC inline constexpr Vec operator*() const
                {
                    return index_;
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline constexpr const_iterator operator++()
                {
                    increment();
                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline constexpr const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    increment();
                    return old;
                }

                ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
                {
                    return (index_ == other.index_);
                }

                ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // construct an iterator pointing to the first element to be processed by the current thread
                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
                    : loop_{loop}
                    , first_{alpaka::elementwise_min(first, loop->extent_)}
                    , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
                    , index_{first_}
                {
                }

                // construct an end iterator, pointing past the end of the extent
                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
                    : loop_{loop}
                    , first_{loop_->extent_}
                    , range_{loop_->extent_}
                    , index_{loop_->extent_}
                {
                }

                template<size_t I>
                ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
                {
                    bool overflow = false;
                    ++index_[I];
                    if(index_[I] >= range_[I])
                    {
                        index_[I] = first_[I];
                        overflow = true;
                    }
                    return overflow;
                }

                template<size_t N>
                ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
                {
                    if constexpr(N == 0)
                    {
                        // overflow
                        return true;
                    }
                    else
                    {
                        if(not nth_elements_loop<N - 1>())
                        {
                            return false;
                        }
                        else
                        {
                            return do_elements_loops<N - 1>();
                        }
                    }
                    ALPAKA_UNREACHABLE(false);
                }

                template<size_t I>
                ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
                {
                    bool overflow = false;
                    first_[I] += loop_->stride_[I];
                    if(first_[I] >= loop_->extent_[I])
                    {
                        first_[I] = loop_->thread_[I];
                        overflow = true;
                    }
                    index_[I] = first_[I];
                    range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
                    return overflow;
                }

                template<size_t N>
                ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
                {
                    if constexpr(N == 0)
                    {
                        // overflow
                        return true;
                    }
                    else
                    {
                        if(not nth_strided_loop<N - 1>())
                        {
                            return false;
                        }
                        else
                        {
                            return do_strided_loops<N - 1>();
                        }
                    }
                    ALPAKA_UNREACHABLE(false);
                }

                // increment the iterator
                ALPAKA_FN_ACC inline constexpr void increment()
                {
                    // linear N-dimensional loops over the elements associated to the thread;
                    // do_elements_loops<>() returns true if any of those loops overflows
                    if(not do_elements_loops<Dim::value>())
                    {
                        // the elements loops did not overflow, return the next index
                        return;
                    }

                    // strided N-dimensional loop over the threads in the kernel launch grid;
                    // do_strided_loops<>() returns true if any of those loops overflows
                    if(not do_strided_loops<Dim::value>())
                    {
                        // the strided loops did not overflow, return the next index
                        return;
                    }

                    // the iterator has reached or passed the end of the extent, clamp it to the extent
                    first_ = loop_->extent_;
                    range_ = loop_->extent_;
                    index_ = loop_->extent_;
                }

                // const pointer to the UniformElementsND that the iterator refers to
                UniformElementsND const* loop_;

                // modified by the pre/post-increment operator
                Vec first_; // first element processed by this thread
                Vec range_; // end of the element range processed by this thread (exclusive)
                Vec index_; // current element processed by this thread
            };

        private:
            Vec const elements_;
            Vec const thread_;
            Vec const stride_;
            Vec const extent_;
        };

    } // namespace detail

    /* uniformElementsND
     *
     * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
     */

    template<
        typename TAcc,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
    {
        return detail::UniformElementsND<TAcc>(acc);
    }

    template<
        typename TAcc,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformElementsND(
        TAcc const& acc,
        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> extent)
    {
        return detail::UniformElementsND<TAcc>(acc, extent);
    }
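
    /* Example (illustrative sketch, not part of this header): the same 2-dimensional problem space can be covered
     * with a single loop over `Vec` indices. The buffers, `width` and `height` are hypothetical placeholders;
     * index 0 is the slower dimension and index 1 the faster one.
     *
     *   using Vec2D = alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>;
     *
     *   for (auto ndindex : alpaka::uniformElementsND(acc, Vec2D{height, width})) {
     *     auto y = ndindex[0];
     *     auto x = ndindex[1];
     *     out[y * width + x] = in[y * width + x];
     *   }
     */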

    namespace detail
    {

        /* UniformGroupsAlong
         *
         * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group
         * indices required to cover the given problem size along the `Dim` dimension, in units of the block size.
         * `elements` indicates the total number of elements, across all groups; if not specified, it defaults to the
         * kernel grid size along the `Dim` dimension.
         *
         * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
         * infer the accelerator type from the argument.
         *
         * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc,
         * ...)`.
         *
         * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
         * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
         * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
         *
         * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
         * threads in a block see the same loop iterations, while threads in different blocks may see a different
         * number of iterations. If the work division has more blocks than the required number of groups, the first
         * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
         * work division has fewer blocks than the required number of groups, some of the blocks will perform more than
         * one iteration, in order to cover the whole problem space.
         *
         * If the problem size is not a multiple of the block size, the last group will process a number of elements
         * smaller than the block size. However, also in this case all threads in the block will execute the same
         * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
         * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
         * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
         *
         * For example, if the block size is 64 and there are 400 elements
         *
         *   for (auto group : uniformGroupsAlong<Dim>(acc, 400))
         *
         * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
         * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
         * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
         * last group; it is up to the inner loop to not process the non-existing elements after 399.
         *
         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
         * 0 to 6 will process one group while block 7 will not process any.
         *
         * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
         * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0
         * will process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6,
         * and block 3 will process group 3.
         *
         * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
         * `uniformGroupElementsAlong<Dim>`.
         */

        template<
            typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
        class UniformGroupsAlong
        {
        public:
            using Idx = alpaka::Idx<TAcc>;

            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , extent_{stride_}
            {
            }

            // extent is the total number of elements (not blocks)
            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
                , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
            {
            }

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                return const_iterator(stride_, extent_, first_);
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                return const_iterator(stride_, extent_, extent_);
            }

            class const_iterator
            {
                friend class UniformGroupsAlong;

                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
                    : stride_{stride}
                    , extent_{extent}
                    , first_{std::min(first, extent)}
                {
                }

            public:
                ALPAKA_FN_ACC inline Idx operator*() const
                {
                    return first_;
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline const_iterator& operator++()
                {
                    // increment the first-element-in-block index by the grid stride
                    first_ += stride_;
                    if(first_ < extent_)
                        return *this;

                    // the iterator has reached or passed the end of the extent, clamp it to the extent
                    first_ = extent_;
                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    ++(*this);
                    return old;
                }

                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
                {
                    return (first_ == other.first_);
                }

                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // non-const to support iterator copy and assignment
                Idx stride_;
                Idx extent_;
                // modified by the pre/post-increment operator
                Idx first_;
            };

        private:
            Idx const first_;
            Idx const stride_;
            Idx const extent_;
        };

    } // namespace detail

    /* uniformGroups
     *
     * `uniformGroups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required
     * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
     * across all groups; if not specified, it defaults to the kernel grid size.
     *
     * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
     *
     * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
     * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
     * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
     * the loop, while the other blocks will exit the loop immediately. If the work division has fewer blocks than the
     * required number of groups, some of the blocks will perform more than one iteration, in order to cover the whole
     * problem space.
     *
     * If the problem size is not a multiple of the block size, the last group will process a number of elements
     * smaller than the block size. However, also in this case all threads in the block will execute the same number of
     * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
     * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
     * implemented by `uniformGroupElements(acc, group, elements)`.
     *
     * For example, if the block size is 64 and there are 400 elements
     *
     *   for (auto group : uniformGroups(acc, 400))
     *
     * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
     * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
     * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
     * it is up to the inner loop to not process the non-existing elements after 399.
     *
     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
     * will process one group while block 7 will not process any.
     *
     * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
     * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
     * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block
     * 3 will process group 3.
     *
     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
     *
     * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
     * use
     *   - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
     *   - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
     *     along the fastest, second-fastest, or third-fastest dimension.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
    }

    /* uniformGroupsAlong<Dim>
     *
     * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
     * can infer the accelerator type from the argument.
     */

    template<
        std::size_t Dim,
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
    }

    /* uniformGroupsAlongX, Y, Z
     *
     * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
     * dimensions.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
    ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
    }

    namespace detail
    {

        /* UniformGroupElementsAlong
         *
         * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iterable range that
         * spans all the elements within the given `group` along dimension `Dim`, as obtained from
         * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
         * across all groups; if not specified, it defaults to the kernel grid size.
         *
         * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
         * ...)` that can infer the accelerator type from the argument.
         *
         * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
         * `UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
         *
         * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
         * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
         * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
         *
         * Iterating over the range yields values of type `ElementIndex`, which provide the `.global` and `.local`
         * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
         * (excluded), while the local index spans the range from 0 to the block size (excluded).
         *
         * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
         * the global element index reaches `elements`.
         *
         * If the problem size is not a multiple of the block size, different threads may execute a different number of
         * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
         * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
         * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
         * `uniformGroupElementsAlong<Dim>`.
         *
         * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
         * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
         * check the element index explicitly inside the loop:
         *
         *   for (auto element : uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
         *     bool flag = false;
         *     if (element < elements) {
         *       // do some work and compute a result flag only for the valid elements
         *       flag = do_some_work();
         *     }
         *     // check if any valid element had a positive result
         *     if (alpaka::warp::any(acc, flag)) {
         *       // ...
         *     }
         *   }
         *
         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
         * `N-1`.
         */

        template<
            typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
        class UniformGroupElementsAlong
        {
        public:
            using Idx = alpaka::Idx<TAcc>;

            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
                , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
                         * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
                , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
            {
            }

            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
                , local_{std::min(
                      extent - first_,
                      alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
                          * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
                , range_{
                      std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
            {
            }

            class const_iterator;
            using iterator = const_iterator;

            ALPAKA_FN_ACC inline const_iterator begin() const
            {
                return const_iterator(local_, first_, range_);
            }

            ALPAKA_FN_ACC inline const_iterator end() const
            {
                return const_iterator(range_, first_, range_);
            }

            class const_iterator
            {
                friend class UniformGroupElementsAlong;

                ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
                    : index_{local}
                    , first_{first}
                    , range_{range}
                {
                }

            public:
                ALPAKA_FN_ACC inline ElementIndex<Idx> operator*() const
                {
                    return ElementIndex<Idx>{index_ + first_, index_};
                }

                // pre-increment the iterator
                ALPAKA_FN_ACC inline const_iterator& operator++()
                {
                    // increment the index along the elements processed by the current thread
                    ++index_;
                    if(index_ < range_)
                        return *this;

                    // the iterator has reached or passed the end of the extent, clamp it to the extent
                    index_ = range_;
                    return *this;
                }

                // post-increment the iterator
                ALPAKA_FN_ACC inline const_iterator operator++(int)
                {
                    const_iterator old = *this;
                    ++(*this);
                    return old;
                }

                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
                {
                    return (index_ == other.index_);
                }

                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
                {
                    return not(*this == other);
                }

            private:
                // modified by the pre/post-increment operator
                Idx index_;
                // non-const to support iterator copy and assignment
                Idx first_;
                Idx range_;
            };

        private:
            Idx const first_;
            Idx const local_;
            Idx const range_;
        };

    } // namespace detail

    /* uniformGroupElements
     *
     * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
     * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
     * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
     *
     * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
     *
     * Iterating over the range yields values of type `ElementIndex`, which provide the `.global` and `.local` indices
     * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
     * while the local index spans the range from 0 to the block size (excluded).
     *
     * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
     * global element index reaches `elements`.
     *
     * If the problem size is not a multiple of the block size, different threads may execute a different number of
     * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
     * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
     *
     * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
     * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
     * element index explicitly inside the loop:
     *
     *   for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
     *     bool flag = false;
     *     if (element < elements) {
     *       // do some work and compute a result flag only for the valid elements
     *       flag = do_some_work();
     *     }
     *     // check if any valid element had a positive result
     *     if (alpaka::warp::any(acc, flag)) {
     *       // ...
     *     }
     *   }
     *
     * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
     * kernels, use
     *   - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension
     *     `Dim`;
     *   - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
     *     `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
     *     dimension.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
    }
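
    /* Example (illustrative sketch, not part of this header): the `ElementIndex` values returned by
     * `uniformGroupElements` carry both indices, so the same loop body can address global memory with `.global`
     * and per-block shared memory with `.local`. The fragment below stages one group of elements into a
     * shared-memory buffer before block-level processing; `blockSize` (a compile-time constant), `in`, `out` and
     * `size` are hypothetical placeholders.
     *
     *   auto& buffer = alpaka::declareSharedVar<float[blockSize], __COUNTER__>(acc);
     *   for (auto group : alpaka::uniformGroups(acc, size)) {
     *     for (auto idx : alpaka::uniformGroupElements(acc, group, size)) {
     *       buffer[idx.local] = in[idx.global];
     *     }
     *     // every thread in the block reaches this synchronisation, even those with no valid element
     *     alpaka::syncBlockThreads();
     *     // ... block-level processing of buffer[], e.g. a tree reduction into out[group] ...
     *     alpaka::syncBlockThreads();
     *   }
     */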

    /* uniformGroupElementsAlong<Dim>
     *
     * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
     */

    template<
        std::size_t Dim,
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
    }

    /* uniformGroupElementsAlongX, Y, Z
     *
     * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
     * dimensions.
     */

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
    }

    template<
        typename TAcc,
        typename... TArgs,
        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
    {
        using Idx = alpaka::Idx<TAcc>;
        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
    }

} // namespace alpaka