alpaka
Abstraction Library for Parallel Kernel Acceleration
BlockSyncUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
10 
11 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
12 
13 namespace alpaka
14 {
15  //! The GPU CUDA/HIP block synchronization.
17  : public interface::Implements<ConceptBlockSync, BlockSyncUniformCudaHipBuiltIn>
18  {
19  };
20 
21 # if !defined(ALPAKA_HOST_ONLY)
22 
23 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
24 # error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
25 # endif
26 
27 # if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
28 # error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
29 # endif
30 
31  namespace trait
32  {
33  template<>
35  {
36  __device__ static auto syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/) -> void
37  {
38  __syncthreads();
39  }
40  };
41 
42  template<>
44  {
45  __device__ static auto syncBlockThreadsPredicate(
46  BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
47  int predicate) -> int
48  {
49 # if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
50  // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
51  __shared__ int tmp;
52  __syncthreads();
53  if(threadIdx.x == 0)
54  tmp = 0;
55  __syncthreads();
56  if(predicate)
57  ::atomicAdd(&tmp, 1);
58  __syncthreads();
59 
60  return tmp;
61 # else
62  return __syncthreads_count(predicate);
63 # endif
64  }
65  };
66 
67  template<>
69  {
70  __device__ static auto syncBlockThreadsPredicate(
71  BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
72  int predicate) -> int
73  {
74 # if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
75  // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
76  __shared__ int tmp;
77  __syncthreads();
78  if(threadIdx.x == 0)
79  tmp = 1;
80  __syncthreads();
81  if(!predicate)
82  ::atomicAnd(&tmp, 0);
83  __syncthreads();
84 
85  return tmp;
86 # else
87  return __syncthreads_and(predicate);
88 # endif
89  }
90  };
91 
92  template<>
94  {
95  __device__ static auto syncBlockThreadsPredicate(
96  BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
97  int predicate) -> int
98  {
99 # if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
100  // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
101  __shared__ int tmp;
102  __syncthreads();
103  if(threadIdx.x == 0)
104  tmp = 0;
105  __syncthreads();
106  if(predicate)
107  ::atomicOr(&tmp, 1);
108  __syncthreads();
109 
110  return tmp;
111 # else
112  return __syncthreads_or(predicate);
113 # endif
114  }
115  };
116  } // namespace trait
117 
118 # endif
119 
120 } // namespace alpaka
121 
122 #endif
The GPU CUDA/HIP block synchronization.
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAdd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic add operation.
Definition: Traits.hpp:114
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicOr(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic or operation.
Definition: Traits.hpp:258
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAnd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic and operation.
Definition: Traits.hpp:240
The logical and function object.
Definition: Traits.hpp:60
The counting function object.
Definition: Traits.hpp:44
The logical or function object.
Definition: Traits.hpp:76
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition: Interface.hpp:15
static __device__ auto syncBlockThreadsPredicate(BlockSyncUniformCudaHipBuiltIn const &, int predicate) -> int
static __device__ auto syncBlockThreadsPredicate(BlockSyncUniformCudaHipBuiltIn const &, int predicate) -> int
static __device__ auto syncBlockThreadsPredicate(BlockSyncUniformCudaHipBuiltIn const &, int predicate) -> int
The block synchronization and predicate operation trait.
Definition: Traits.hpp:27
static __device__ auto syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const &) -> void
The block synchronization operation trait.
Definition: Traits.hpp:23