13#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
22# if !defined(ALPAKA_HOST_ONLY)
24# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
25# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
28# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
29# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
44 struct GetSizeCompileTime<WarpUniformCudaHipBuiltIn>
48# if defined(__CUDA_ARCH__)
51# elif defined(__HIP_DEVICE_COMPILE__)
56# elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
61# ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
62 return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
64# error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
75 struct GetSizeUpperLimit<WarpUniformCudaHipBuiltIn>
79# if defined(__CUDA_ARCH__)
82# elif defined(__HIP_DEVICE_COMPILE__)
87# elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
92# ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
93 return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
95# error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
106 struct Activemask<WarpUniformCudaHipBuiltIn>
108 static __device__
auto activemask(warp::WarpUniformCudaHipBuiltIn
const& )
109# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
115# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
116 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
117 return __activemask();
126 struct All<WarpUniformCudaHipBuiltIn>
128 static __device__
auto all(
129 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
130 std::int32_t predicate) -> std::int32_t
132# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
133 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
134 return __all_sync(
activemask(warp), predicate);
136 return __all(predicate);
142 struct Any<WarpUniformCudaHipBuiltIn>
144 static __device__
auto any(
145 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
146 std::int32_t predicate) -> std::int32_t
148# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
149 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
150 return __any_sync(
activemask(warp), predicate);
152 return __any(predicate);
158 struct Ballot<WarpUniformCudaHipBuiltIn>
160 static __device__
auto ballot(
161 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
162 std::int32_t predicate)
164# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
170# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
171 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
172 return __ballot_sync(
activemask(warp), predicate);
174 return __ballot(predicate);
180 struct Shfl<WarpUniformCudaHipBuiltIn>
183 static __device__
auto shfl(
184 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
187 std::int32_t width) -> T
189# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
190 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
191 return __shfl_sync(
activemask(warp), val, srcLane, width);
193 return __shfl(val, srcLane, width);
199 struct ShflUp<WarpUniformCudaHipBuiltIn>
202 static __device__
auto shfl_up(
203 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
205 std::uint32_t offset,
206 std::int32_t width) -> T
208# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
209 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
210 return __shfl_up_sync(
activemask(warp), val, offset, width);
212 return __shfl_up(val, offset, width);
218 struct ShflDown<WarpUniformCudaHipBuiltIn>
222 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
224 std::uint32_t offset,
225 std::int32_t width) -> T
227# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
228 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
229 return __shfl_down_sync(
activemask(warp), val, offset, width);
231 return __shfl_down(val, offset, width);
237 struct ShflXor<WarpUniformCudaHipBuiltIn>
241 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
244 std::int32_t width) -> T
246# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) \
247 || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && ALPAKA_COMP_HIP >= ALPAKA_VERSION_NUMBER(6, 2, 0))
248 return __shfl_xor_sync(
activemask(warp), val, mask, width);
250 return __shfl_xor(val, mask, width);
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...