alpaka
Abstraction Library for Parallel Kernel Acceleration
WarpUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
9 #include "alpaka/warp/Traits.hpp"
10 
11 #include <cstdint>
12 
13 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
14 
15 namespace alpaka::warp
16 {
17  //! The GPU CUDA/HIP warp.
18  class WarpUniformCudaHipBuiltIn : public interface::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
19  {
20  };
21 
22 # if !defined(ALPAKA_HOST_ONLY)
23 
24 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
25 # error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
26 # endif
27 
28 # if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
29 # error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
30 # endif
31 
32  namespace trait
33  {
34  template<>
36  {
37  __device__ static auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
38  {
39  return warpSize;
40  }
41  };
42 
43  template<>
45  {
46  __device__ static auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
47 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
48  -> std::uint32_t
49 # else
50  -> std::uint64_t
51 # endif
52  {
53 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
54  return __activemask();
55 # else
56  // No HIP intrinsic for it, emulate via ballot
57  return __ballot(1);
58 # endif
59  }
60  };
61 
62  template<>
64  {
65  __device__ static auto all(
66  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
67  std::int32_t predicate) -> std::int32_t
68  {
69 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
70  return __all_sync(0xffff'ffff, predicate);
71 # else
72  return __all(predicate);
73 # endif
74  }
75  };
76 
77  template<>
79  {
80  __device__ static auto any(
81  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
82  std::int32_t predicate) -> std::int32_t
83  {
84 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
85  return __any_sync(0xffff'ffff, predicate);
86 # else
87  return __any(predicate);
88 # endif
89  }
90  };
91 
92  template<>
94  {
95  __device__ static auto ballot(
96  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
97  std::int32_t predicate)
98  // return type is required by the compiler
99 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
100  -> std::uint32_t
101 # else
102  -> std::uint64_t
103 # endif
104  {
105 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
106  return __ballot_sync(0xffff'ffff, predicate);
107 # else
108  return __ballot(predicate);
109 # endif
110  }
111  };
112 
113  template<>
115  {
116  template<typename T>
117  __device__ static auto shfl(
118  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
119  T val,
120  int srcLane,
121  std::int32_t width) -> T
122  {
123 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
124  return __shfl_sync(0xffff'ffff, val, srcLane, width);
125 # else
126  return __shfl(val, srcLane, width);
127 # endif
128  }
129  };
130 
131  template<>
133  {
134  template<typename T>
135  __device__ static auto shfl_up(
136  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
137  T val,
138  std::uint32_t offset,
139  std::int32_t width) -> T
140  {
141 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
142  return __shfl_up_sync(0xffff'ffff, val, offset, width);
143 # else
144  return __shfl_up(val, offset, width);
145 # endif
146  }
147  };
148 
149  template<>
151  {
152  template<typename T>
153  __device__ static auto shfl_down(
154  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
155  T val,
156  std::uint32_t offset,
157  std::int32_t width) -> T
158  {
159 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
160  return __shfl_down_sync(0xffff'ffff, val, offset, width);
161 # else
162  return __shfl_down(val, offset, width);
163 # endif
164  }
165  };
166 
167  template<>
169  {
170  template<typename T>
171  __device__ static auto shfl_xor(
172  [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
173  T val,
174  std::int32_t mask,
175  std::int32_t width) -> T
176  {
177 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
178  return __shfl_xor_sync(0xffff'ffff, val, mask, width);
179 # else
180  return __shfl_xor(val, mask, width);
181 # endif
182  }
183  };
184 
185  } // namespace trait
186 # endif
187 } // namespace alpaka::warp
188 
189 #endif
constexpr auto offset
Definition: Extent.hpp:34
constexpr std::uint32_t warpSize
This is a shortcut for the trait defined above.
Definition: Traits.hpp:109
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition: Interface.hpp:15
static __device__ auto activemask(warp::WarpUniformCudaHipBuiltIn const &) -> std::uint32_t
The active mask trait.
Definition: Traits.hpp:56
static __device__ auto all([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, std::int32_t predicate) -> std::int32_t
The all warp vote trait.
Definition: Traits.hpp:28
static __device__ auto any([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, std::int32_t predicate) -> std::int32_t
The any warp vote trait.
Definition: Traits.hpp:32
static __device__ auto ballot([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, std::int32_t predicate) -> std::uint32_t
The ballot warp vote trait.
Definition: Traits.hpp:36
static __device__ auto getSize(warp::WarpUniformCudaHipBuiltIn const &) -> std::int32_t
The warp size trait.
Definition: Traits.hpp:24
static __device__ auto shfl_down([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, T val, std::uint32_t offset, std::int32_t width) -> T
The shfl down warp swizzling trait.
Definition: Traits.hpp:48
static __device__ auto shfl_up([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, T val, std::uint32_t offset, std::int32_t width) -> T
The shfl up warp swizzling trait.
Definition: Traits.hpp:44
static __device__ auto shfl_xor([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, T val, std::int32_t mask, std::int32_t width) -> T
The shfl xor warp swizzling trait.
Definition: Traits.hpp:52
static __device__ auto shfl([[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const &warp, T val, int srcLane, std::int32_t width) -> T
The shfl warp swizzling trait.
Definition: Traits.hpp:40