alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
QueueUniformCudaHipRt.hpp
Go to the documentation of this file.
1/* Copyright 2025 Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
2 * Antonio Di Pilato
3 * SPDX-License-Identifier: MPL-2.0
4 */
5
6#pragma once
7
10#include "alpaka/core/Hip.hpp"
12#include "alpaka/dev/Traits.hpp"
19
20#include <condition_variable>
21#include <functional>
22#include <future>
23#include <memory>
24#include <mutex>
25#include <thread>
26
27#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
28
29namespace alpaka
30{
31 template<typename TApi>
32 class EventUniformCudaHipRt;
33
34 template<typename TApi>
35 class DevUniformCudaHipRt;
36
37 template<typename TApi>
38 struct PlatformUniformCudaHipRt;
39
40 namespace uniform_cuda_hip::detail
41 {
42 //! The CUDA/HIP RT queue implementation.
43 template<typename TApi>
45 {
46 public:
48 : m_dev(dev)
49 , m_UniformCudaHipQueue()
50 , m_isOwning(true)
51 {
53
54 // Set the current device.
55 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));
56
57 // - [cuda/hip]StreamDefault: Default queue creation flag.
58 // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run
59 // concurrently with work in queue 0 (the NULL queue),
60 // and that the created queue should perform no implicit synchronization with queue 0.
61 // Create the queue on the current device.
62 // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantics implemented in the alpaka
63 // CPU queue. It would be too much work to implement implicit default queue synchronization on CPU.
64
66 TApi::streamCreateWithFlags(&m_UniformCudaHipQueue, TApi::streamNonBlocking));
67 }
68
69 ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(typename TApi::Stream_t stream)
72 static_cast<std::size_t>(TApi::getCurrentDevice())))
73 , m_UniformCudaHipQueue(stream)
74 , m_isOwning(false)
75 {
76 }
77
80
82 {
84
85 // Make sure all pending async work is finished before destroying the stream to guarantee determinism.
86 // This would not be necessary for plain CUDA/HIP operations, but we can have host functions in the
87 // stream, which reference this queue instance and its CallbackThread. Make sure they are done.
88 if(m_isOwning)
89 {
90 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamSynchronize(m_UniformCudaHipQueue));
91 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamDestroy(m_UniformCudaHipQueue));
92 }
93 }
94
//! Returns the native CUDA/HIP stream handle stored in this implementation object.
//! The handle is returned by value; the object's stored handle is left untouched.
95 [[nodiscard]] auto getNativeHandle() const noexcept
96 {
97 return m_UniformCudaHipQueue;
98 }
99
100 public:
101 DevUniformCudaHipRt<TApi> const m_dev; //!< The device this queue is bound to.
103
104 private:
105 typename TApi::Stream_t m_UniformCudaHipQueue;
106 bool m_isOwning;
107 };
108
109 //! The CUDA/HIP RT queue.
110 template<typename TApi, bool TBlocking>
112 : public interface::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRt<TApi, TBlocking>>
113 , public interface::Implements<ConceptQueue, QueueUniformCudaHipRt<TApi, TBlocking>>
114 , public interface::Implements<ConceptGetDev, QueueUniformCudaHipRt<TApi, TBlocking>>
115 {
116 public:
122
//! Creates a queue around an already existing native stream.
//!
//! The shared implementation object is constructed from \p stream; the stream-taking implementation
//! constructor marks it as non-owning, so its destructor neither synchronizes nor destroys the stream.
//! \param stream The native CUDA/HIP stream to wrap.
123 ALPAKA_FN_HOST QueueUniformCudaHipRt(typename TApi::Stream_t stream)
124 : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl<TApi>>(stream))
125 {
// Register the freshly created implementation with the device it is bound to.
126 m_spQueueImpl->m_dev.registerQueue(m_spQueueImpl);
127 }
128
//! Equality comparison: two queue handles are equal if they refer to the same shared implementation
//! object (the shared pointers are compared, not the native stream handles).
//! \param rhs The queue to compare against.
//! \return true if both queues share one implementation object.
129 ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const& rhs) const -> bool
130 {
131 return (m_spQueueImpl == rhs.m_spQueueImpl);
132 }
133
//! Inequality comparison, defined as the negation of operator==.
//! \param rhs The queue to compare against.
//! \return true if the queues do not share one implementation object.
134 ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const& rhs) const -> bool
135 {
136 return !((*this) == rhs);
137 }
138
//! Returns the native CUDA/HIP stream handle by forwarding to the shared implementation object.
139 [[nodiscard]] auto getNativeHandle() const noexcept
140 {
141 return m_spQueueImpl->getNativeHandle();
142 }
143
145 {
146 return m_spQueueImpl->m_callbackThread;
147 }
148
149 public:
150 std::shared_ptr<QueueUniformCudaHipRtImpl<TApi>> m_spQueueImpl;
151 };
152 } // namespace uniform_cuda_hip::detail
153
154 namespace trait
155 {
156 //! The CUDA/HIP RT queue device get trait specialization.
157 template<typename TApi, bool TBlocking>
158 struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
159 {
163 {
164 return queue.m_spQueueImpl->m_dev;
165 }
166 };
167
168 //! The CUDA/HIP RT queue test trait specialization.
169 template<typename TApi, bool TBlocking>
170 struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
171 {
174 {
176
177 // Query is allowed even for queues on non current device.
178 typename TApi::Error_t ret = TApi::success;
180 ret = TApi::streamQuery(queue.getNativeHandle()),
181 TApi::errorNotReady);
182 return (ret == TApi::success);
183 }
184 };
185
186 //! The CUDA/HIP RT queue thread wait trait specialization.
187 //!
188 //! Blocks execution of the calling thread until the queue has finished processing all previously requested
189 //! tasks (kernels, data copies, ...)
190 template<typename TApi, bool TBlocking>
191 struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
192 {
195 {
197
198 // Sync is allowed even for queues on non current device.
199 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
200 }
201 };
202
203 //! The CUDA/HIP RT queue device type trait specialization.
204 template<typename TApi, bool TBlocking>
205 struct DevType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
206 {
208 };
209
210 //! The CUDA/HIP RT queue event type trait specialization.
211 template<typename TApi, bool TBlocking>
212 struct EventType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
213 {
215 };
216
217 //! The CUDA/HIP RT queue enqueue trait specialization.
218 template<typename TApi, bool TBlocking, typename TTask>
219 struct Enqueue<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>, TTask>
220 {
222
223 struct HostFuncData
224 {
225 // We don't need to keep the queue alive, because in its dtor it will synchronize with the CUDA/HIP
226 // stream and wait until all host functions and the CallbackThread are done. It's actually an error to
227 // copy the queue into the host function. Destroying it here would call CUDA/HIP APIs from the host
228 // function. Passing it further to the CallbackThread would make the CallbackThread hold a task
229 // containing the queue with the CallbackThread itself. Destroying the task if no other queue instance
230 // exists would make the CallbackThread join itself and crash.
232 TTask t;
233 };
234
236 {
237 auto data = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
238 auto& queue = data->q;
239 auto f = queue.m_callbackThread.submit([d = std::move(data)] { d->t(); });
240 f.wait();
241 }
242
245 TTask const& task) -> void
246 {
247 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::launchHostFunc(
248 queue.getNativeHandle(),
249 uniformCudaHipRtHostFunc,
250 new HostFuncData{*queue.m_spQueueImpl, task}));
251 if constexpr(TBlocking)
252 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
253 }
254 };
255
256 //! The CUDA/HIP RT queue native handle trait specialization.
257 template<typename TApi, bool TBlocking>
258 struct NativeHandle<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
259 {
260 [[nodiscard]] static auto getNativeHandle(
262 {
263 return queue.getNativeHandle();
264 }
265 };
266 } // namespace trait
267} // namespace alpaka
268
269#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd,...)
CUDA/HIP runtime error checking with log and exception, ignoring specific error values.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
ALPAKA_FN_HOST auto registerQueue(std::shared_ptr< IDeviceQueue > spQueue) const -> void
Registers the given queue on this device. NOTE: Every queue has to be registered for correct function...
The CUDA/HIP RT device event.
auto operator=(QueueUniformCudaHipRtImpl &&) -> QueueUniformCudaHipRtImpl &=delete
QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl &&)=default
ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt< TApi > const &dev)
DevUniformCudaHipRt< TApi > const m_dev
The device this queue is bound to.
ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(typename TApi::Stream_t stream)
ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt< TApi > const &dev)
ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const &rhs) const -> bool
ALPAKA_FN_HOST QueueUniformCudaHipRt(typename TApi::Stream_t stream)
ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const &rhs) const -> bool
std::shared_ptr< QueueUniformCudaHipRtImpl< TApi > > m_spQueueImpl
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The alpaka accelerator library.
ALPAKA_FN_HOST auto getDevByIdx(TPlatform const &platform, std::size_t const &devIdx) -> Dev< TPlatform >
Definition Traits.hpp:62
STL namespace.
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition Interface.hpp:15
static ALPAKA_FN_HOST auto currentThreadWaitFor(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> void
The thread wait trait.
Definition Traits.hpp:21
The device type trait.
Definition Traits.hpp:23
static ALPAKA_FN_HOST auto empty(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> bool
The queue empty trait.
Definition Traits.hpp:31
static ALPAKA_FN_HOST auto enqueue(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > &queue, TTask const &task) -> void
The queue enqueue trait.
Definition Traits.hpp:27
The event type trait.
Definition Traits.hpp:17
static ALPAKA_FN_HOST auto getDev(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition Traits.hpp:27
static auto getNativeHandle(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue)
The native handle trait.
Definition Traits.hpp:17