UniformElements.hpp
1 #pragma once
2 
3 #include "alpaka/acc/Traits.hpp"
7 
8 #include <algorithm>
9 #include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
10 #include <cstddef>
11 #include <type_traits>
12 
13 namespace alpaka
14 {
15 
16  namespace detail
17  {
18 
19  /* UniformElementsAlong
20  *
21  * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that
22  * spans the element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension. If
23  * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
24  * size along the `Dim` dimension.
25  *
26  * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
27  * can infer the accelerator type from the argument.
28  *
29  * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
30  * 0>(acc, ...)`.
31  *
32  * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
33  * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
34  * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
35  * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
36  *
37  * To cover the problem space, different threads may execute a different number of iterations. As a result, it
38  * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
39  * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
40  * loop over each group's elements, and synchronise only in the outer loop:
41  *
42  * for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
43  * for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
44  * // first part of the computation
45  * // no synchronisations here
46  * ...
47  * }
48  * // wait for all threads to complete the first part
49  * alpaka::syncBlockThreads();
50  * for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
51  * // second part of the computation
52  * // no synchronisations here
53  * ...
54  * }
55  * // wait for all threads to complete the second part
56  * alpaka::syncBlockThreads();
57  * ...
58  * }
59  *
60  * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
61  * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
62  * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
63  * check the element index explicitly inside the loop:
64  *
65  * for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
66  * bool flag = false;
67  * if (element < extent) {
68  * // do some work and compute a result flag only for the valid elements
69  * flag = do_some_work();
70  * }
71  * // check if any valid element had a positive result
72  * if (alpaka::warp::any(acc, flag)) {
73  * // ...
74  * }
75  * }
76  *
77  * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
78  * `N-1`.
79  */
80 
81  template<
82  typename TAcc,
83  std::size_t Dim,
84  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
85  class UniformElementsAlong
86  {
87  public:
88  using Idx = alpaka::Idx<TAcc>;
89 
90  ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
91  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
92  , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
93  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
94  , extent_{stride_}
95  {
96  }
97 
98  ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
99  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
100  , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
101  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
102  , extent_{extent}
103  {
104  }
105 
106  ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
107  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
108  , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
109  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
110  , extent_{extent}
111  {
112  }
113 
114  class const_iterator;
116 
117  ALPAKA_FN_ACC inline const_iterator begin() const
118  {
119  return const_iterator(elements_, stride_, extent_, first_);
120  }
121 
122  ALPAKA_FN_ACC inline const_iterator end() const
123  {
124  return const_iterator(elements_, stride_, extent_, extent_);
125  }
126 
127  class const_iterator
128  {
129  friend class UniformElementsAlong;
130 
131  ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
132  : elements_{elements}
133  ,
134  // we need to reduce the stride by one element range because index_ is later increased with each
135  // increment
136  stride_{stride - elements}
137  , extent_{extent}
138  , index_{std::min(first, extent)}
139  {
140  }
141 
142  public:
143  ALPAKA_FN_ACC inline Idx operator*() const
144  {
145  return index_;
146  }
147 
148  // pre-increment the iterator
149  ALPAKA_FN_ACC inline const_iterator& operator++()
150  {
151  // increment the index along the elements processed by the current thread
152  ++indexElem_;
153  ++index_;
154  if(indexElem_ >= elements_)
155  {
156  indexElem_ = 0;
157  index_ += stride_;
158  }
159  if(index_ >= extent_)
160  index_ = extent_;
161 
162  return *this;
163  }
164 
165  // post-increment the iterator
166  ALPAKA_FN_ACC inline const_iterator operator++(int)
167  {
168  const_iterator old = *this;
169  ++(*this);
170  return old;
171  }
172 
173  ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
174  {
175  return (*(*this) == *other);
176  }
177 
178  ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
179  {
180  return not(*this == other);
181  }
182 
183  private:
184  // non-const to support iterator copy and assignment
185  Idx elements_;
186  Idx stride_;
187  Idx extent_;
188  // modified by the pre/post-increment operator
189  Idx index_;
190  Idx indexElem_ = 0;
191  };
192 
193  private:
194  Idx const elements_;
195  Idx const first_;
196  Idx const stride_;
197  Idx const extent_;
198  };
199 
200  } // namespace detail
201 
202  /* uniformElements
203  *
204  * `uniformElements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element
205  * indices from `first` (inclusive) to `extent` (exclusive). If `first` is not specified, it defaults to 0. If
206  * `extent` is not specified, it defaults to the kernel grid size.
207  *
208  * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
209  *
210  * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
211  * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
212  * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
213  * over each group's elements, and synchronise only in the outer loop:
214  *
215  * for (auto group : uniformGroups(acc, extent)) {
216  * for (auto element : uniformGroupElements(acc, group, extent)) {
217  * // first part of the computation
218  * // no synchronisations here
219  * ...
220  * }
221  * // wait for all threads to complete the first part
222  * alpaka::syncBlockThreads();
223  * for (auto element : uniformGroupElements(acc, group, extent)) {
224  * // second part of the computation
225  * // no synchronisations here
226  * ...
227  * }
228  * // wait for all threads to complete the second part
229  * alpaka::syncBlockThreads();
230  * ...
231  * }
232  *
233  * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
234  * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
235  * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
236  * element index explicitly inside the loop:
237  *
238  * for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
239  * bool flag = false;
240  * if (element < extent) {
241  * // do some work and compute a result flag only for elements up to extent
242  * flag = do_some_work();
243  * }
244  * // check if any valid element had a positive result
245  * if (alpaka::warp::any(acc, flag)) {
246  * // ...
247  * }
248  * }
249  *
250  * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
251  * use
252  * - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
253  * - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
254  * - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
255  * to loop along the fastest, second-fastest, or third-fastest dimension.
256  */
257 
258  template<
259  typename TAcc,
260  typename... TArgs,
261  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
262  ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
263  {
264  using Idx = alpaka::Idx<TAcc>;
265  return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
266  }
267 
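/* Example
 *
 * A minimal usage sketch, not part of this header: a one-dimensional kernel that scales a buffer in place
 * with `uniformElements`, assuming a 1-dimensional work division. The kernel name and the `data`, `factor`
 * and `size` arguments are illustrative.
 *
 *   struct ScaleKernel
 *   {
 *       template<typename TAcc>
 *       ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, float factor, std::size_t size) const
 *       {
 *           // each thread visits only the elements assigned to it by the work division
 *           for(auto index : alpaka::uniformElements(acc, size))
 *           {
 *               data[index] *= factor;
 *           }
 *       }
 *   };
 */
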
268  /* uniformElementsAlong<Dim>
269  *
270  * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
271  * that can infer the accelerator type from the argument.
272  */
273 
274  template<
275  std::size_t Dim,
276  typename TAcc,
277  typename... TArgs,
278  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
279  ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
280  {
281  using Idx = alpaka::Idx<TAcc>;
282  return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
283  }
284 
285  /* uniformElementsAlongX, Y, Z
286  *
287  * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
288  * dimensions.
289  */
290 
291  template<
292  typename TAcc,
293  typename... TArgs,
294  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
295  ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
296  {
297  using Idx = alpaka::Idx<TAcc>;
298  return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
299  }
300 
301  template<
302  typename TAcc,
303  typename... TArgs,
304  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
305  ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
306  {
307  using Idx = alpaka::Idx<TAcc>;
308  return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
309  }
310 
311  template<
312  typename TAcc,
313  typename... TArgs,
314  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
315  ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
316  {
317  using Idx = alpaka::Idx<TAcc>;
318  return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
319  }
320 
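/* Example
 *
 * An illustrative sketch, not part of this header: a 3-dimensional kernel using the X/Y/Z shorthands,
 * assuming a 3-dimensional accelerator. Dimension 0 (Z) varies most slowly and dimension 2 (X) fastest,
 * matching the CUDA/HIP convention. The kernel name and arguments are hypothetical.
 *
 *   struct Fill3DKernel
 *   {
 *       template<typename TAcc>
 *       ALPAKA_FN_ACC void operator()(
 *           TAcc const& acc,
 *           float* data,
 *           alpaka::Vec<alpaka::DimInt<3u>, alpaka::Idx<TAcc>> extent) const
 *       {
 *           for(auto z : alpaka::uniformElementsAlongZ(acc, extent[0]))
 *               for(auto y : alpaka::uniformElementsAlongY(acc, extent[1]))
 *                   for(auto x : alpaka::uniformElementsAlongX(acc, extent[2]))
 *                       data[(z * extent[1] + y) * extent[2] + x] = 0.f;
 *       }
 *   };
 */
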
321  namespace detail
322  {
323 
324  /* UniformElementsND
325  *
326  * `UniformElementsND(acc, extent)` returns an N-dimensional iterable range that spans the element indices
327  * required to cover the given problem size, indicated by `extent`.
328  *
329  * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
330  *
331  * To cover the problem space, different threads may execute a different number of iterations. As a result, it
332  * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
333  * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
334  * loop over each group's elements, and synchronise only in the outer loop:
335  *
336  * for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
337  * for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
338  * for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
339  * for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
340  * // first part of the computation
341  * // no synchronisations here
342  * ...
343  * }
344  * }
345  * // wait for all threads to complete the first part
346  * alpaka::syncBlockThreads();
347  * for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
348  * for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
349  * // second part of the computation
350  * // no synchronisations here
351  * ...
352  * }
353  * }
354  * // wait for all threads to complete the second part
355  * alpaka::syncBlockThreads();
356  * ...
357  * }
358  * }
359  *
360  * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
361  */
362 
363  template<
364  typename TAcc,
365  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
366  class UniformElementsND
367  {
368  public:
369  using Dim = alpaka::Dim<TAcc>;
370  using Idx = alpaka::Idx<TAcc>;
371  using Vec = alpaka::Vec<Dim, Idx>;
372 
373  ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
374  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
375  , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
376  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
377  , extent_{stride_}
378  {
379  }
380 
381  ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
382  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
383  , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
384  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
385  , extent_{extent}
386  {
387  }
388 
389  // tag used to construct an end iterator
390  struct at_end_t
391  {
392  };
393 
394  class const_iterator;
396 
397  ALPAKA_FN_ACC inline const_iterator begin() const
398  {
399  // check that all dimensions of the current thread index are within the extent
400  if((thread_ < extent_).all())
401  {
402  // construct an iterator pointing to the first element to be processed by the current thread
403  return const_iterator{this, thread_};
404  }
405  else
406  {
407  // construct an end iterator, pointing past the end of the extent
408  return const_iterator{this, at_end_t{}};
409  }
410  }
411 
412  ALPAKA_FN_ACC inline const_iterator end() const
413  {
414  // construct an end iterator, pointing past the end of the extent
415  return const_iterator{this, at_end_t{}};
416  }
417 
418  class const_iterator
419  {
420  friend class UniformElementsND;
421 
422  public:
423  ALPAKA_FN_ACC inline Vec operator*() const
424  {
425  return index_;
426  }
427 
428  // pre-increment the iterator
429  ALPAKA_FN_ACC inline constexpr const_iterator operator++()
430  {
431  increment();
432  return *this;
433  }
434 
435  // post-increment the iterator
436  ALPAKA_FN_ACC inline constexpr const_iterator operator++(int)
437  {
438  const_iterator old = *this;
439  increment();
440  return old;
441  }
442 
443  ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
444  {
445  return (index_ == other.index_);
446  }
447 
448  ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
449  {
450  return not(*this == other);
451  }
452 
453  private:
454  // construct an iterator pointing to the first element to be processed by the current thread
455  ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
456  : loop_{loop}
457  , first_{alpaka::elementwise_min(first, loop->extent_)}
458  , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
459  , index_{first_}
460  {
461  }
462 
463  // construct an end iterator, pointing past the end of the extent
464  ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
465  : loop_{loop}
466  , first_{loop_->extent_}
467  , range_{loop_->extent_}
468  , index_{loop_->extent_}
469  {
470  }
471 
472  template<size_t I>
473  ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
474  {
475  bool overflow = false;
476  ++index_[I];
477  if(index_[I] >= range_[I])
478  {
479  index_[I] = first_[I];
480  overflow = true;
481  }
482  return overflow;
483  }
484 
485  template<size_t N>
486  ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
487  {
488  if constexpr(N == 0)
489  {
490  // overflow
491  return true;
492  }
493  else
494  {
495  if(not nth_elements_loop<N - 1>())
496  {
497  return false;
498  }
499  else
500  {
501  return do_elements_loops<N - 1>();
502  }
503  }
504  ALPAKA_UNREACHABLE(false);
505  }
506 
507  template<size_t I>
508  ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
509  {
510  bool overflow = false;
511  first_[I] += loop_->stride_[I];
512  if(first_[I] >= loop_->extent_[I])
513  {
514  first_[I] = loop_->thread_[I];
515  overflow = true;
516  }
517  index_[I] = first_[I];
518  range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
519  return overflow;
520  }
521 
522  template<size_t N>
523  ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
524  {
525  if constexpr(N == 0)
526  {
527  // overflow
528  return true;
529  }
530  else
531  {
532  if(not nth_strided_loop<N - 1>())
533  {
534  return false;
535  }
536  else
537  {
538  return do_strided_loops<N - 1>();
539  }
540  }
541  ALPAKA_UNREACHABLE(false);
542  }
543 
544  // increment the iterator
545  ALPAKA_FN_ACC inline constexpr void increment()
546  {
547  // linear N-dimensional loops over the elements associated to the thread;
548  // do_elements_loops<>() returns true if any of those loops overflows
549  if(not do_elements_loops<Dim::value>())
550  {
551  // the elements loops did not overflow, return the next index
552  return;
553  }
554 
555  // strided N-dimensional loop over the threads in the kernel launch grid;
556  // do_strided_loops<>() returns true if any of those loops overflows
557  if(not do_strided_loops<Dim::value>())
558  {
559  // the strided loops did not overflow, return the next index
560  return;
561  }
562 
563  // the iterator has reached or passed the end of the extent, clamp it to the extent
564  first_ = loop_->extent_;
565  range_ = loop_->extent_;
566  index_ = loop_->extent_;
567  }
568 
569  // const pointer to the UniformElementsND that the iterator refers to
570  UniformElementsND const* loop_;
571 
572  // modified by the pre/post-increment operator
573  Vec first_; // first element processed by this thread
574  Vec range_; // end of the range of elements processed by this thread
575  Vec index_; // current element processed by this thread
576  };
577 
578  private:
579  Vec const elements_;
580  Vec const thread_;
581  Vec const stride_;
582  Vec const extent_;
583  };
584 
585  } // namespace detail
586 
587  /* uniformElementsND
588  *
589  * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
590  */
591 
592  template<
593  typename TAcc,
594  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
595  ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
596  {
597  return detail::UniformElementsND<TAcc>(acc);
598  }
599 
600  template<
601  typename TAcc,
602  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
603  ALPAKA_FN_ACC inline auto uniformElementsND(
604  TAcc const& acc,
605  alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> extent)
606  {
607  return detail::UniformElementsND<TAcc>(acc, extent);
608  }
609 
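/* Example
 *
 * An illustrative sketch, not part of this header: a 2-dimensional kernel that covers a matrix with a single
 * N-dimensional loop, assuming a 2-dimensional accelerator. The kernel name and arguments are hypothetical.
 *
 *   struct InitMatrixKernel
 *   {
 *       template<typename TAcc>
 *       ALPAKA_FN_ACC void operator()(
 *           TAcc const& acc,
 *           float* data,
 *           alpaka::Vec<alpaka::DimInt<2u>, alpaka::Idx<TAcc>> extent) const
 *       {
 *           for(auto ndindex : alpaka::uniformElementsND(acc, extent))
 *           {
 *               // ndindex[0] is the slow (row) index, ndindex[1] the fast (column) index
 *               data[ndindex[0] * extent[1] + ndindex[1]] = 0.f;
 *           }
 *       }
 *   };
 */
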
610  namespace detail
611  {
612 
613  /* UniformGroupsAlong
614  *
615  * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group
616  * indices required to cover the given problem size along the `Dim` dimension, in units of the block size.
617  * `elements` indicates the total number of elements, across all groups; if not specified, it defaults to the
618  * kernel grid size along the `Dim` dimension.
619  *
620  * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
621  * infer the accelerator type from the argument.
622  *
623  * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc,
624  * ...)`.
625  *
626  * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
627  * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
628  * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
629  * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
630  *
631  * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
632  * threads in a block see the same loop iterations, while threads in different blocks may see a different
633  * number of iterations. If the work division has more blocks than the required number of groups, the first
634  * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
635  * work division has fewer blocks than the required number of groups, some of the blocks will perform more than
636  * one iteration, in order to cover the whole problem space.
637  *
638  * If the problem size is not a multiple of the block size, the last group will process a number of elements
639  * smaller than the block size. However, also in this case all threads in the block will execute the same
640  * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
641  * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
642  * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
643  *
644  * For example, if the block size is 64 and there are 400 elements
645  *
646  * for (auto group: uniformGroupsAlong<Dim>(acc, 400))
647  *
648  * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
649  * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
650  * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
651  * last group; it is up to the inner loop to not process the non-existing elements after 399.
652  *
653  * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
654  * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
655  * 0 to 6 will process one group while block 7 will not process any.
656  *
657  * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
658  * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0
659  * will process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6,
660  * and block 3 will process group 3.
661  *
662  * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
663  * `uniformGroupElementsAlong<Dim>`.
664  */
665 
666  template<
667  typename TAcc,
668  std::size_t Dim,
669  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
670  class UniformGroupsAlong
671  {
672  public:
673  using Idx = alpaka::Idx<TAcc>;
674 
675  ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
676  : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
677  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
678  , extent_{stride_}
679  {
680  }
681 
682  // extent is the total number of elements (not blocks)
683  ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
684  : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
685  , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
686  , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
687  {
688  }
689 
690  class const_iterator;
692 
693  ALPAKA_FN_ACC inline const_iterator begin() const
694  {
695  return const_iterator(stride_, extent_, first_);
696  }
697 
698  ALPAKA_FN_ACC inline const_iterator end() const
699  {
700  return const_iterator(stride_, extent_, extent_);
701  }
702 
703  class const_iterator
704  {
705  friend class UniformGroupsAlong;
706 
707  ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
708  : stride_{stride}
709  , extent_{extent}
710  , first_{std::min(first, extent)}
711  {
712  }
713 
714  public:
715  ALPAKA_FN_ACC inline Idx operator*() const
716  {
717  return first_;
718  }
719 
720  // pre-increment the iterator
721  ALPAKA_FN_ACC inline const_iterator& operator++()
722  {
723  // increment the first-element-in-block index by the grid stride
724  first_ += stride_;
725  if(first_ < extent_)
726  return *this;
727 
728  // the iterator has reached or passed the end of the extent, clamp it to the extent
729  first_ = extent_;
730  return *this;
731  }
732 
733  // post-increment the iterator
734  ALPAKA_FN_ACC inline const_iterator operator++(int)
735  {
736  const_iterator old = *this;
737  ++(*this);
738  return old;
739  }
740 
741  ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
742  {
743  return (first_ == other.first_);
744  }
745 
746  ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
747  {
748  return not(*this == other);
749  }
750 
751  private:
752  // non-const to support iterator copy and assignment
753  Idx stride_;
754  Idx extent_;
755  // modified by the pre/post-increment operator
756  Idx first_;
757  };
758 
759  private:
760  Idx const first_;
761  Idx const stride_;
762  Idx const extent_;
763  };
764 
765  } // namespace detail
766 
767  /* uniformGroups
768  *
769  * `uniformGroups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required
770  * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
771  * across all groups; if not specified, it defaults to the kernel grid size.
772  *
773  * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
774  *
775  * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
776  * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
777  * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
778  * the loop, while the other blocks will exit the loop immediately. If the work division has fewer blocks than the
779  * required number of groups, some of the blocks will perform more than one iteration, in order to cover the whole
780  * problem space.
781  *
782  * If the problem size is not a multiple of the block size, the last group will process a number of elements
783  * smaller than the block size. However, also in this case all threads in the block will execute the same number of
784  * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
785  * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
786  * implemented by `uniformGroupElements(acc, group, elements)`.
787  *
788  * For example, if the block size is 64 and there are 400 elements
789  *
790  * for (auto group: uniformGroups(acc, 400))
791  *
792  * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
793  * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
794  * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
795  * it is up to the inner loop to not process the non-existing elements after 399.
796  *
797  * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
798  * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
799  * will process one group while block 7 will not process any.
800  *
801  * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
802  * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
803  * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block
804  * 3 will process group 3.
805  *
806  * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
807  *
808  * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
809  * use
810  * - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
811  * - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
812  * along the fastest, second-fastest, or third-fastest dimension.
813  */
814 
815  template<
816  typename TAcc,
817  typename... TArgs,
818  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
819  ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
820  {
821  using Idx = alpaka::Idx<TAcc>;
822  return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
823  }
824 
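/* Example
 *
 * An illustrative sketch, not part of this header, of the two-phase pattern described above, combining
 * `uniformGroups`, `uniformGroupElements` and a block-level synchronisation. The kernel name, the
 * `do_first_part` / `do_second_part` helpers and the fixed group size of 64 elements are hypothetical
 * assumptions.
 *
 *   struct TwoPhaseKernel
 *   {
 *       template<typename TAcc>
 *       ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, std::size_t size) const
 *       {
 *           // one shared-memory slot per element in the group (assumes 64 elements per block)
 *           auto& buffer = alpaka::declareSharedVar<float[64], __COUNTER__>(acc);
 *           for(auto group : alpaka::uniformGroups(acc, size))
 *           {
 *               // first phase: stage the partial results in shared memory
 *               for(auto element : alpaka::uniformGroupElements(acc, group, size))
 *                   buffer[element.local] = do_first_part(in[element.global]);
 *               // all threads in the block run the same number of group iterations, so this is safe
 *               alpaka::syncBlockThreads(acc);
 *               // second phase: consume the staged results
 *               for(auto element : alpaka::uniformGroupElements(acc, group, size))
 *                   out[element.global] = do_second_part(buffer, element.local);
 *               alpaka::syncBlockThreads(acc);
 *           }
 *       }
 *   };
 */
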
825  /* uniformGroupsAlong<Dim>
826  *
827  * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
828  * can infer the accelerator type from the argument.
829  */
830 
831  template<
832  std::size_t Dim,
833  typename TAcc,
834  typename... TArgs,
835  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
836  ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
837  {
838  using Idx = alpaka::Idx<TAcc>;
839  return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
840  }
841 
842  /* uniformGroupsAlongX, Y, Z
843  *
844  * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
845  * dimensions.
846  */
847 
848  template<
849  typename TAcc,
850  typename... TArgs,
851  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
852  ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
853  {
854  using Idx = alpaka::Idx<TAcc>;
855  return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
856  }
857 
858  template<
859  typename TAcc,
860  typename... TArgs,
861  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
862  ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
863  {
864  using Idx = alpaka::Idx<TAcc>;
865  return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
866  }
867 
868  template<
869  typename TAcc,
870  typename... TArgs,
871  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
872  ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
873  {
874  using Idx = alpaka::Idx<TAcc>;
875  return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
876  }
877 
878  namespace detail
879  {
880 
881  /* UniformGroupElementsAlong
882  *
883  * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iterable range that
884  * spans all the elements within the given `group` along dimension `Dim`, as obtained from
885  * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
886  * across all groups; if not specified, it defaults to the kernel grid size.
887  *
888  * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
889  * ...)` that can infer the accelerator type from the argument.
890  *
891  * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
892  * `UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
893  *
894  * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop),
895  * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
896  * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
897  * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
898  *
899  * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local`
900  * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
901  * (excluded), while the local index spans the range from 0 to the block size (excluded).
902  *
903  * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
904  * the global element index reaches `elements`.
905  *
906  * If the problem size is not a multiple of the block size, different threads may execute a different number of
907  * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
908  * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
909  * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
910  * `uniformGroupElementsAlong<Dim>`.
911  *
912  * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
913  * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
914  * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
915  * check the element index explicitly inside the loop:
916  *
917  * for (auto element : uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements,
918  * alpaka::warp::getSize(acc)))) { bool flag = false; if (element < elements) {
919  * // do some work and compute a result flag only for the valid elements
920  * flag = do_some_work();
921  * }
922  * // check if any valid element had a positive result
923  * if (alpaka::warp::any(acc, flag)) {
924  * // ...
925  * }
926  * }
927  *
928  * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
929  * `N-1`.
930  */
931 
932  template<
933  typename TAcc,
934  std::size_t Dim,
935  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
936  class UniformGroupElementsAlong
937  {
938  public:
939  using Idx = alpaka::Idx<TAcc>;
940 
941  ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
942  : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
943  , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
944  , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
945  {
946  }
947 
948  ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
949  : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
950  , local_{std::min(
951  extent - first_,
952  alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
953  * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
954  , range_{
955  std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
956  {
957  }
958 
959  class const_iterator;
961 
962  ALPAKA_FN_ACC inline const_iterator begin() const
963  {
964  return const_iterator(local_, first_, range_);
965  }
966 
967  ALPAKA_FN_ACC inline const_iterator end() const
968  {
969  return const_iterator(range_, first_, range_);
970  }
971 
972  class const_iterator
973  {
974  friend class UniformGroupElementsAlong;
975 
976  ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
977  : index_{local}
978  , first_{first}
979  , range_{range}
980  {
981  }
982 
983  public:
984  ALPAKA_FN_ACC inline ElementIndex<Idx> operator*() const
985  {
986  return ElementIndex<Idx>{index_ + first_, index_};
987  }
988 
989  // pre-increment the iterator
990  ALPAKA_FN_ACC inline const_iterator& operator++()
991  {
992  // increment the index along the elements processed by the current thread
993  ++index_;
994  if(index_ < range_)
995  return *this;
996 
997  // the iterator has reached or passed the end of the extent, clamp it to the extent
998  index_ = range_;
999  return *this;
1000  }
1001 
1002  // post-increment the iterator
1003  ALPAKA_FN_ACC inline const_iterator operator++(int)
1004  {
1005  const_iterator old = *this;
1006  ++(*this);
1007  return old;
1008  }
1009 
1010  ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
1011  {
1012  return (index_ == other.index_);
1013  }
1014 
1015  ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
1016  {
1017  return not(*this == other);
1018  }
1019 
1020  private:
1021  // modified by the pre/post-increment operator
1022  Idx index_;
1023  // non-const to support iterator copy and assignment
1024  Idx first_;
1025  Idx range_;
1026  };
1027 
1028  private:
1029  Idx const first_;
1030  Idx const local_;
1031  Idx const range_;
1032  };
1033 
1034  } // namespace detail
1035 
1036  /* uniformGroupElements
1037  *
1038  * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
1039  * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
1040  * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
1041  *
1042  * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
1043  *
1044  * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices
1045  * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
1046  * while the local index spans the range from 0 to the block size (excluded).
1047  *
1048  * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
1049  * global element index reaches `elements`.
1050  *
1051  * If the problem size is not a multiple of the block size, different threads may execute a different number of
1052  * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
1053  * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
1054  * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
1055  *
1056  * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
1057  * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
1058  * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
1059  * element index explicitly inside the loop:
1060  *
1061  * for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
1062  * bool flag = false;
1063  * if (element < elements) {
1064  * // do some work and compute a result flag only for the valid elements
1065  * flag = do_some_work();
1066  * }
1067  * // check if any valid element had a positive result
1068  * if (alpaka::warp::any(acc, flag)) {
1069  * // ...
1070  * }
1071  * }
1072  *
1073  * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
1074  * kernels, use
1075  * - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension
1076  * `Dim`;
1077  * - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
1078  * `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
1079  * dimension.
1080  */
1081 
1082  template<
1083  typename TAcc,
1084  typename... TArgs,
1085  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
1086  ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
1087  {
1088  using Idx = alpaka::Idx<TAcc>;
1089  return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
1090  }
1091 
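/* Example
 *
 * An illustrative sketch, not part of this header, showing the `.global` and `.local` indices yielded when
 * iterating over `uniformGroupElements`, assuming a 1-dimensional work division. The kernel name and the
 * `blockWeights` argument are hypothetical.
 *
 *   struct WeightedCopyKernel
 *   {
 *       template<typename TAcc>
 *       ALPAKA_FN_ACC void operator()(
 *           TAcc const& acc,
 *           float const* in,
 *           float const* blockWeights, // one weight per element slot within a block
 *           float* out,
 *           std::size_t size) const
 *       {
 *           for(auto group : alpaka::uniformGroups(acc, size))
 *               for(auto element : alpaka::uniformGroupElements(acc, group, size))
 *                   // element.global indexes the problem space, element.local the position inside the block
 *                   out[element.global] = in[element.global] * blockWeights[element.local];
 *       }
 *   };
 */
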
1092  /* uniformGroupElementsAlong<Dim>
1093  *
1094  * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
1095  * Dim>(acc, ...)` that can infer the accelerator type from the argument.
1096  */
1097 
1098  template<
1099  std::size_t Dim,
1100  typename TAcc,
1101  typename... TArgs,
1102  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
1103  ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
1104  {
1105  using Idx = alpaka::Idx<TAcc>;
1106  return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
1107  }
1108 
1109  /* uniformGroupElementsAlongX, Y, Z
1110  *
1111  * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
1112  * dimensions.
1113  */
1114 
1115  template<
1116  typename TAcc,
1117  typename... TArgs,
1118  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
1119  ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
1120  {
1121  using Idx = alpaka::Idx<TAcc>;
1122  return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
1123  }
1124 
1125  template<
1126  typename TAcc,
1127  typename... TArgs,
1128  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
1129  ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
1130  {
1131  using Idx = alpaka::Idx<TAcc>;
1132  return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
1133  }
1134 
1135  template<
1136  typename TAcc,
1137  typename... TArgs,
1138  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
1139  ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
1140  {
1141  using Idx = alpaka::Idx<TAcc>;
1142  return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
1143  }
1144 
1145 } // namespace alpaka