alpaka
Abstraction Library for Parallel Kernel Acceleration
WarpGenericSycl.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
2  * SPDX-License-Identifier: MPL-2.0
3  *
4  * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
5  * from Intel DPCT.
6  * Copyright (C) Intel Corporation.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  * See https://llvm.org/LICENSE.txt for license information.
9  */
10 
11 #pragma once
12 
13 #include "alpaka/core/Assert.hpp"
14 #include "alpaka/warp/Traits.hpp"
15 
16 #include <cstdint>
17 
18 #ifdef ALPAKA_ACC_SYCL_ENABLED
19 
20 # include <sycl/sycl.hpp>
21 
22 namespace alpaka::warp
23 {
24  //! The SYCL warp.
25  template<typename TDim>
26  class WarpGenericSycl : public interface::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
27  {
28  public:
29  WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
30  {
31  }
32 
33  sycl::nd_item<TDim::value> m_item_warp;
34  };
35 } // namespace alpaka::warp
36 
37 namespace alpaka::warp::trait
38 {
39  template<typename TDim>
40  struct GetSize<warp::WarpGenericSycl<TDim>>
41  {
42  static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
43  {
44  auto const sub_group = warp.m_item_warp.get_sub_group();
45  // SYCL sub-groups are always 1D
46  return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
47  }
48  };
49 
50  template<typename TDim>
51  struct Activemask<warp::WarpGenericSycl<TDim>>
52  {
53  // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
54  // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
55  // Restrict to warpSize <= 32 for now.
56  static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
57  {
58  static_assert(!sizeof(warp), "activemask is not supported on SYCL");
59  // SYCL does not have an API to get the activemask. It is also questionable (to me, bgruber) whether an
60  // "activemask" even exists on some hardware architectures, since the idea is bound to threads being
61  // "turned off" when they take different control flow in a warp. A SYCL implementation could run each
62  // thread as a SIMD lane, in which cause the "thread" is always active, but some SIMD lanes are either
63  // predicated off, or side-effects are masked out when writing them back.
64  //
65  // An implementation via oneAPI's sycl::ext::oneapi::group_ballot causes UB, because activemask is expected
66  // to be callable when less than all threads are active in a warp (CUDA). But SYCL requires all threads of
67  // a group to call the function.
68  //
69  // Intel's CUDA -> SYCL migration tool also suggests that there is no direct equivalent and the user must
70  // rewrite their kernel logic. See also:
71  // https://oneapi-src.github.io/SYCLomatic/dev_guide/diagnostic_ref/dpct1086.html
72 
73  return ~std::uint32_t{0};
74  }
75  };
76 
77  template<typename TDim>
78  struct All<warp::WarpGenericSycl<TDim>>
79  {
80  static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
81  {
82  auto const sub_group = warp.m_item_warp.get_sub_group();
83  return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
84  }
85  };
86 
87  template<typename TDim>
88  struct Any<warp::WarpGenericSycl<TDim>>
89  {
90  static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
91  {
92  auto const sub_group = warp.m_item_warp.get_sub_group();
93  return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
94  }
95  };
96 
97  template<typename TDim>
98  struct Ballot<warp::WarpGenericSycl<TDim>>
99  {
100  // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
101  // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
102  // Restrict to warpSize <= 32 for now.
103  static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
104  {
105  auto const sub_group = warp.m_item_warp.get_sub_group();
106  auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
107  // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
108  // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
109  // Restrict to warpSize <= 32 for now.
110  std::uint32_t bits = 0;
111  mask.extract_bits(bits);
112  return bits;
113  }
114  };
115 
116  template<typename TDim>
117  struct Shfl<warp::WarpGenericSycl<TDim>>
118  {
119  template<typename T>
120  static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
121  {
122  ALPAKA_ASSERT_ACC(width > 0);
123  ALPAKA_ASSERT_ACC(srcLane >= 0);
124 
125  /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
126  subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
127 
128  Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
129  The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
130  first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
131  auto const actual_group = warp.m_item_warp.get_sub_group();
132  std::uint32_t const w = static_cast<std::uint32_t>(width);
133  std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
134  return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
135  }
136  };
137 
138  template<typename TDim>
139  struct ShflUp<warp::WarpGenericSycl<TDim>>
140  {
141  template<typename T>
142  static auto shfl_up(
143  warp::WarpGenericSycl<TDim> const& warp,
144  T value,
145  std::uint32_t offset, /* must be the same for all work-items in the group */
146  std::int32_t width)
147  {
148  auto const actual_group = warp.m_item_warp.get_sub_group();
149  std::uint32_t const w = static_cast<std::uint32_t>(width);
150  std::uint32_t const id = actual_group.get_local_linear_id();
151  std::uint32_t const start_index = id / w * w;
152  T result = sycl::shift_group_right(actual_group, value, offset);
153  if((id - start_index) < offset)
154  {
155  result = value;
156  }
157  return result;
158  }
159  };
160 
161  template<typename TDim>
162  struct ShflDown<warp::WarpGenericSycl<TDim>>
163  {
164  template<typename T>
165  static auto shfl_down(
166  warp::WarpGenericSycl<TDim> const& warp,
167  T value,
168  std::uint32_t offset,
169  std::int32_t width)
170  {
171  auto const actual_group = warp.m_item_warp.get_sub_group();
172  std::uint32_t const w = static_cast<std::uint32_t>(width);
173  std::uint32_t const id = actual_group.get_local_linear_id();
174  std::uint32_t const end_index = (id / w + 1) * w;
175  T result = sycl::shift_group_left(actual_group, value, offset);
176  if((id + offset) >= end_index)
177  {
178  result = value;
179  }
180  return result;
181  }
182  };
183 
184  template<typename TDim>
185  struct ShflXor<warp::WarpGenericSycl<TDim>>
186  {
187  template<typename T>
188  static auto shfl_xor(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t mask, std::int32_t width)
189  {
190  auto const actual_group = warp.m_item_warp.get_sub_group();
191  std::uint32_t const w = static_cast<std::uint32_t>(width);
192  std::uint32_t const id = actual_group.get_local_linear_id();
193  std::uint32_t const start_index = id / w * w;
194  std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
195  return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
196  }
197  };
198 } // namespace alpaka::warp::trait
199 
200 #endif
#define ALPAKA_ASSERT_ACC(...)
ALPAKA_ASSERT_ACC is an assert-like macro.
Definition: Assert.hpp:52
constexpr auto offset
Definition: Extent.hpp:34
The warp traits.
Definition: Traits.hpp:21
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition: Traits.hpp:114
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
Definition: Traits.hpp:164
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
Definition: Traits.hpp:236
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
Definition: Traits.hpp:137
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
Definition: Traits.hpp:274
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp >>::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
Definition: Traits.hpp:90
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
Definition: Traits.hpp:65
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
Definition: Traits.hpp:312
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Definition: Traits.hpp:198