alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
BlockSyncUniformCudaHipBuiltIn.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
10
11#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
12
13namespace alpaka
14{
15 //! The GPU CUDA/HIP block synchronization.
17 : public interface::Implements<ConceptBlockSync, BlockSyncUniformCudaHipBuiltIn>
18 {
19 };
20
21# if !defined(ALPAKA_HOST_ONLY)
22
23# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
24# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
25# endif
26
27# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
28# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
29# endif
30
31 namespace trait
32 {
33 template<>
34 struct SyncBlockThreads<BlockSyncUniformCudaHipBuiltIn>
35 {
36 __device__ static auto syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/) -> void
37 {
38 __syncthreads();
39 }
40 };
41
42 template<>
43 struct SyncBlockThreadsPredicate<BlockCount, BlockSyncUniformCudaHipBuiltIn>
44 {
45 __device__ static auto syncBlockThreadsPredicate(
46 BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
47 int predicate) -> int
48 {
49# if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && ALPAKA_COMP_HIP
50# if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
51# pragma clang diagnostic push
52# pragma clang diagnostic ignored "-Wunique-object-duplication"
53# endif
54 // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
55 __shared__ int tmp;
56# if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
57# pragma clang diagnostic pop
58# endif
59 __syncthreads();
60 if(threadIdx.x == 0)
61 tmp = 0;
62 __syncthreads();
63 if(predicate)
64 ::atomicAdd(&tmp, 1);
65 __syncthreads();
66
67 return tmp;
68# else
69 return __syncthreads_count(predicate);
70# endif
71 }
72 };
73
74 template<>
75 struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncUniformCudaHipBuiltIn>
76 {
77 __device__ static auto syncBlockThreadsPredicate(
78 BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
79 int predicate) -> int
80 {
81# if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && ALPAKA_COMP_HIP
82# if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
83# pragma clang diagnostic push
84# pragma clang diagnostic ignored "-Wunique-object-duplication"
85# endif
86 // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
87 __shared__ int tmp;
88# if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
89# pragma clang diagnostic pop
90# endif
91 __syncthreads();
92 if(threadIdx.x == 0)
93 tmp = 1;
94 __syncthreads();
95 if(!predicate)
96 ::atomicAnd(&tmp, 0);
97 __syncthreads();
98
99 return tmp;
100# else
101 return __syncthreads_and(predicate);
102# endif
103 }
104 };
105
106 template<>
107 struct SyncBlockThreadsPredicate<BlockOr, BlockSyncUniformCudaHipBuiltIn>
108 {
109 __device__ static auto syncBlockThreadsPredicate(
110 BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
111 int predicate) -> int
112 {
113# if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && ALPAKA_COMP_HIP
114# if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
115# pragma clang diagnostic push
116# pragma clang diagnostic ignored "-Wunique-object-duplication"
117# endif
118 // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
119 __shared__ int tmp;
120# if ALPAKA_COMP_CLANG >= ALPAKA_VERSION_NUMBER(21, 0, 0)
121# pragma clang diagnostic pop
122# endif
123 __syncthreads();
124 if(threadIdx.x == 0)
125 tmp = 0;
126 __syncthreads();
127 if(predicate)
128 ::atomicOr(&tmp, 1);
129 __syncthreads();
130
131 return tmp;
132# else
133 return __syncthreads_or(predicate);
134# endif
135 }
136 };
137 } // namespace trait
138
139# endif
140
141} // namespace alpaka
142
143#endif
The GPU CUDA/HIP block synchronization.
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const &blockSync) -> void
Synchronizes all threads within the current block (independently for all blocks).
Definition Traits.hpp:36
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAdd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic add operation.
Definition Traits.hpp:114
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreadsPredicate(TBlockSync const &blockSync, int predicate) -> int
Synchronizes all threads within the current block (independently for all blocks), evaluates the predi...
Definition Traits.hpp:100
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicOr(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic or operation.
Definition Traits.hpp:288
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAnd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic and operation.
Definition Traits.hpp:270
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition Interface.hpp:15