alpaka
Abstraction Library for Parallel Kernel Acceleration
MemFenceUniformCudaHipBuiltIn.hpp
/* Copyright 2022 Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Tapish Narwal
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/core/PP.hpp"

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    //! The GPU CUDA/HIP memory fence.
    class MemFenceUniformCudaHipBuiltIn : public interface::Implements<ConceptMemFence, MemFenceUniformCudaHipBuiltIn>
    {
    };

#    if !defined(ALPAKA_HOST_ONLY)
#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !ALPAKA_LANG_CUDA
#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#        endif

#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !ALPAKA_LANG_HIP
#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#        endif

    namespace detail
    {
        // For CUDA > 12.8 the compiler built-in __nv_atomic_thread_fence is available for compute
        // capability versions > 7. NVCC defines __CUDACC_DEVICE_ATOMIC_BUILTINS__ when built-in atomic
        // functions are supported by the compute capability. It is unclear how this will behave with
        // clang-cuda; currently the inline PTX version of the code is sufficient for thread fences.
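        // PTX fence scope qualifiers: ".gpu" orders memory accesses for all threads on the current device,
        // while ".cta" only orders accesses for threads within the same thread block (CTA).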
        template<alpaka::MemoryOrder TMemOrder>
        [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_device([[maybe_unused]] TMemOrder order)
        {
#        if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
            // full acquire/release semantics support
            if constexpr(std::is_same_v<TMemOrder, mem_order::Relaxed>)
            { // Relaxed ordering requires no fence
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Acquire>)
            {
                asm volatile("fence.acquire.gpu;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Release>)
            {
                asm volatile("fence.release.gpu;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::AcqRel>)
            {
                asm volatile("fence.acq_rel.gpu;" ::);
            }
            else
            { // Sequential consistency
                asm volatile("fence.sc.gpu;" ::);
            }
#        elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
            // only acq_rel and sc available
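            // Acquire and Release are conservatively strengthened to acq_rel, which implies both.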
            if constexpr(std::is_same_v<TMemOrder, mem_order::Relaxed>)
            { // Relaxed ordering requires no fence
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Acquire>)
            {
                asm volatile("fence.acq_rel.gpu;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Release>)
            {
                asm volatile("fence.acq_rel.gpu;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::AcqRel>)
            {
                asm volatile("fence.acq_rel.gpu;" ::);
            }
            else
            { // Sequential consistency
                asm volatile("fence.sc.gpu;" ::);
            }
#        endif
        }

        template<alpaka::MemoryOrder TMemOrder>
        [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_block([[maybe_unused]] TMemOrder order)
        {
#        if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
            // full acquire/release semantics support
            if constexpr(std::is_same_v<TMemOrder, mem_order::Relaxed>)
            { // Relaxed ordering requires no fence
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Acquire>)
            {
                asm volatile("fence.acquire.cta;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Release>)
            {
                asm volatile("fence.release.cta;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::AcqRel>)
            {
                asm volatile("fence.acq_rel.cta;" ::);
            }
            else
            { // Sequential consistency
                asm volatile("fence.sc.cta;" ::);
            }
#        elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
            // only acq_rel and sc available
            if constexpr(std::is_same_v<TMemOrder, mem_order::Relaxed>)
            { // Relaxed ordering requires no fence
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Acquire>)
            {
                asm volatile("fence.acq_rel.cta;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::Release>)
            {
                asm volatile("fence.acq_rel.cta;" ::);
            }
            else if constexpr(std::is_same_v<TMemOrder, mem_order::AcqRel>)
            {
                asm volatile("fence.acq_rel.cta;" ::);
            }
            else
            { // Sequential consistency
                asm volatile("fence.sc.cta;" ::);
            }
#        endif
        }

        template<alpaka::MemoryOrder TMemOrder>
        [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_block([[maybe_unused]] TMemOrder order)
        {
            if constexpr(std::is_same_v<TMemOrder, mem_order::Relaxed>)
            { // Relaxed ordering requires no fence
                return;
            }
#        ifdef ALPAKA_CUDA_ATOMIC
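            // When ALPAKA_CUDA_ATOMIC is defined, the fence is delegated to libcu++
            // (cuda::atomic_thread_fence), which takes the memory order and an explicit thread scope.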
            ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_block);
#        else
#            if ALPAKA_ARCH_PTX
#                if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
            cuda_ptx_fence_block(order);
#                else
            __threadfence_block();
#                endif
#            endif
#        endif
        }

        template<alpaka::MemoryOrder TMemOrder>
        [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_device([[maybe_unused]] TMemOrder order)
        {
            if constexpr(std::is_same_v<TMemOrder, mem_order::Relaxed>)
            { // Relaxed ordering requires no fence
                return;
            }
#        ifdef ALPAKA_CUDA_ATOMIC
            ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_device);
#        else
#            if ALPAKA_ARCH_PTX
#                if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
            cuda_ptx_fence_device(order);
#                else
            __threadfence();
#                endif
#            endif
#        endif
        }
    } // namespace detail

    namespace trait
    {
        template<>
        struct MemFenceDefaultOrder<MemFenceUniformCudaHipBuiltIn>
        {
            using type = mem_order::SeqCst;
            static constexpr auto value = mem_order::seq_cst;
        };

        template<MemoryOrder TMemOrder>
        struct MemFence<MemFenceUniformCudaHipBuiltIn, TMemOrder, memory_scope::Block>
        {
            static __device__ auto mem_fence(
                MemFenceUniformCudaHipBuiltIn const&,
                TMemOrder order,
                memory_scope::Block const&)
            {
#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                alpaka::detail::cuda_mem_fence_block(order);
#        else
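                // __builtin_amdgcn_fence emits an LLVM atomic fence with the given memory order and
                // sync scope; "workgroup" restricts it to the current thread block.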
                __builtin_amdgcn_fence(MemOrderHip::get(order), "workgroup");
#        endif
            }
        };

        template<MemoryOrder TMemOrder, typename TMemScope>
        struct MemFence<MemFenceUniformCudaHipBuiltIn, TMemOrder, TMemScope>
        {
            static __device__ auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, TMemOrder order, TMemScope const&)
            {
                // Base case for grid and device scope fences.
                // CUDA and HIP do not have a per-grid memory fence, so a device-level fence is used.
#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                alpaka::detail::cuda_mem_fence_device(order);
#        else
                __builtin_amdgcn_fence(MemOrderHip::get(order), "agent");
#        endif
            }
        };

    } // namespace trait
#    endif

} // namespace alpaka

#endif
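
For context, a minimal usage sketch (not part of this file): a block-scoped release/acquire pair in a kernel, written against the three-argument alpaka::mem_fence(fence, order, scope) front end that dispatches to the traits above. The tag spellings mem_order::Release{}, mem_order::Acquire{} and memory_scope::Block{} are assumptions based on the types referenced in this header, not confirmed by this listing.

#include <alpaka/alpaka.hpp>

// Sketch only: assumes mem_order::Release{} / mem_order::Acquire{} and memory_scope::Block{}
// are valid tag instances of the types referenced in this header.
struct ProducerConsumerKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, int* data, int volatile* flag) const
    {
        if(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0] == 0)
        {
            *data = 42; // producer: write the payload first
            // Release: the payload write may not be reordered after the flag write below.
            alpaka::mem_fence(acc, alpaka::mem_order::Release{}, alpaka::memory_scope::Block{});
            *flag = 1;
        }
        else
        {
            while(*flag == 0) // consumer: spin until the flag becomes visible
                ;
            // Acquire: the payload read below may not be reordered before the flag read.
            alpaka::mem_fence(acc, alpaka::mem_order::Acquire{}, alpaka::memory_scope::Block{});
            int const value = *data; // observes 42
            static_cast<void>(value);
        }
    }
};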