13#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
22# if !defined(ALPAKA_HOST_ONLY)
24# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
25# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
28# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
29# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
44 struct GetSizeCompileTime<WarpUniformCudaHipBuiltIn>
48# if defined(__CUDA_ARCH__)
51# elif defined(__HIP_DEVICE_COMPILE__)
56# elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
61# ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
62 return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
64# error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
75 struct GetSizeUpperLimit<WarpUniformCudaHipBuiltIn>
79# if defined(__CUDA_ARCH__)
82# elif defined(__HIP_DEVICE_COMPILE__)
87# elif defined(__GFX10__) or defined(__GFX11__) or defined(__GFX12__)
92# ifdef ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
93 return ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
95# error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_AMD_WAVEFRONT_SIZE
106 struct Activemask<WarpUniformCudaHipBuiltIn>
108 __device__
static auto activemask(warp::WarpUniformCudaHipBuiltIn
const& )
109# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
115# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
116 return __activemask();
125 struct All<WarpUniformCudaHipBuiltIn>
127 __device__
static auto all(
128 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
129 std::int32_t predicate) -> std::int32_t
131# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
132 return __all_sync(0xffff'ffff, predicate);
134 return __all(predicate);
140 struct Any<WarpUniformCudaHipBuiltIn>
142 __device__
static auto any(
143 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
144 std::int32_t predicate) -> std::int32_t
146# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
147 return __any_sync(0xffff'ffff, predicate);
149 return __any(predicate);
155 struct Ballot<WarpUniformCudaHipBuiltIn>
157 __device__
static auto ballot(
158 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
159 std::int32_t predicate)
161# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
167# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
168 return __ballot_sync(0xffff'ffff, predicate);
170 return __ballot(predicate);
176 struct Shfl<WarpUniformCudaHipBuiltIn>
179 __device__
static auto shfl(
180 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
183 std::int32_t width) -> T
185# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
186 return __shfl_sync(0xffff'ffff, val, srcLane, width);
188 return __shfl(val, srcLane, width);
194 struct ShflUp<WarpUniformCudaHipBuiltIn>
197 __device__
static auto shfl_up(
198 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
200 std::uint32_t offset,
201 std::int32_t width) -> T
203# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
204 return __shfl_up_sync(0xffff'ffff, val, offset, width);
206 return __shfl_up(val, offset, width);
212 struct ShflDown<WarpUniformCudaHipBuiltIn>
216 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
218 std::uint32_t offset,
219 std::int32_t width) -> T
221# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
222 return __shfl_down_sync(0xffff'ffff, val, offset, width);
224 return __shfl_down(val, offset, width);
230 struct ShflXor<WarpUniformCudaHipBuiltIn>
234 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn
const& warp,
237 std::int32_t width) -> T
239# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
240 return __shfl_xor_sync(0xffff'ffff, val, mask, width);
242 return __shfl_xor(val, mask, width);
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto all(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto ballot(TWarp const &warp, std::int32_t predicate)
Evaluates predicate for all non-exited threads in a warp and returns a 32- or 64-bit unsigned integer...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_up(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with lower ID relative to caller....
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto activemask(TWarp const &warp) -> decltype(trait::Activemask< interface::ImplementationBase< ConceptWarp, TWarp > >::activemask(warp))
Returns a 32- or 64-bit unsigned integer (depending on the accelerator) whose Nth bit is set if and o...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeUpperLimit() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns an uppe...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto any(TWarp const &warp, std::int32_t predicate) -> std::int32_t
Evaluates predicate for all active threads of the warp and returns non-zero if and only if predicate ...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_down(TWarp const &warp, T value, std::uint32_t offset, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane with higher ID relative to caller....
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto getSize(TWarp const &warp) -> std::int32_t
Returns warp size.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl_xor(TWarp const &warp, T value, std::int32_t mask, std::int32_t width=0)
Exchange data between threads within a warp. It copies from a lane based on bitwise XOR of own lane I...
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC constexpr auto getSizeCompileTime() -> std::int32_t
If the warp size is available as a compile-time constant returns its value; otherwise returns 0.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto shfl(TWarp const &warp, T value, std::int32_t srcLane, std::int32_t width=0)
Exchange data between threads within a warp.
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...