alpaka
Abstraction Library for Parallel Kernel Acceleration
QueueUniformCudaHipRt.hpp
/* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
 * Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/core/CallbackThread.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/core/Interface.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/event/Traits.hpp"
#include "alpaka/meta/DependentFalseType.hpp"
#include "alpaka/queue/Traits.hpp"
#include "alpaka/traits/Traits.hpp"
#include "alpaka/wait/Traits.hpp"

#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <thread>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    template<typename TApi>
    class EventUniformCudaHipRt;

    template<typename TApi>
    class DevUniformCudaHipRt;

    namespace uniform_cuda_hip::detail
    {
        //! The CUDA/HIP RT queue implementation.
        template<typename TApi>
        class QueueUniformCudaHipRtImpl final
        {
        public:
            ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt<TApi> const& dev)
                : m_dev(dev)
                , m_UniformCudaHipQueue()
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));

                // - [cuda/hip]StreamDefault: Default queue creation flag.
                // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run
                //   concurrently with work in queue 0 (the NULL queue),
                //   and that the created queue should perform no implicit synchronization with queue 0.
                // Create the queue on the current device.
                // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantics implemented in the alpaka
                // CPU queue. It would be too much work to implement implicit default queue synchronization on CPU.

                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::streamCreateWithFlags(&m_UniformCudaHipQueue, TApi::streamNonBlocking));
            }

            QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl&&) = default;
            auto operator=(QueueUniformCudaHipRtImpl&&) -> QueueUniformCudaHipRtImpl& = delete;

            ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl()
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Make sure all pending async work is finished before destroying the stream to guarantee determinism.
                // This would not be necessary for plain CUDA/HIP operations, but we can have host functions in the
                // stream, which reference this queue instance and its CallbackThread. Make sure they are done.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamSynchronize(m_UniformCudaHipQueue));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamDestroy(m_UniformCudaHipQueue));
            }

            [[nodiscard]] auto getNativeHandle() const noexcept
            {
                return m_UniformCudaHipQueue;
            }

        public:
            DevUniformCudaHipRt<TApi> const m_dev; //!< The device this queue is bound to.
            core::CallbackThread m_callbackThread;

        private:
            typename TApi::Stream_t m_UniformCudaHipQueue;
        };

        //! The CUDA/HIP RT queue.
        template<typename TApi, bool TBlocking>
        class QueueUniformCudaHipRt final
            : public interface::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRt<TApi, TBlocking>>
            , public interface::Implements<ConceptQueue, QueueUniformCudaHipRt<TApi, TBlocking>>
            , public interface::Implements<ConceptGetDev, QueueUniformCudaHipRt<TApi, TBlocking>>
        {
        public:
            ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt<TApi> const& dev)
                : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl<TApi>>(dev))
            {
                dev.registerQueue(m_spQueueImpl);
            }

            ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const& rhs) const -> bool
            {
                return (m_spQueueImpl == rhs.m_spQueueImpl);
            }

            ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const& rhs) const -> bool
            {
                return !((*this) == rhs);
            }

            [[nodiscard]] auto getNativeHandle() const noexcept
            {
                return m_spQueueImpl->getNativeHandle();
            }

            auto getCallbackThread() -> core::CallbackThread&
            {
                return m_spQueueImpl->m_callbackThread;
            }

        public:
            std::shared_ptr<QueueUniformCudaHipRtImpl<TApi>> m_spQueueImpl;
        };
    } // namespace uniform_cuda_hip::detail

    namespace trait
    {
        //! The CUDA/HIP RT queue device get trait specialization.
        template<typename TApi, bool TBlocking>
        struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            ALPAKA_FN_HOST static auto getDev(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
                -> DevUniformCudaHipRt<TApi>
            {
                return queue.m_spQueueImpl->m_dev;
            }
        };

        //! The CUDA/HIP RT queue test trait specialization.
        template<typename TApi, bool TBlocking>
        struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            ALPAKA_FN_HOST static auto empty(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> bool
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Querying is allowed even for queues on a non-current device.
                typename TApi::Error_t ret = TApi::success;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
                    ret = TApi::streamQuery(queue.getNativeHandle()),
                    TApi::errorNotReady);
                return (ret == TApi::success);
            }
        };

        //! The CUDA/HIP RT queue thread wait trait specialization.
        //!
        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
        //! tasks (kernels, data copies, ...)
        template<typename TApi, bool TBlocking>
        struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            ALPAKA_FN_HOST static auto currentThreadWaitFor(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Synchronizing is allowed even for queues on a non-current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };

        //! The CUDA/HIP RT queue device type trait specialization.
        template<typename TApi, bool TBlocking>
        struct DevType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The CUDA/HIP RT queue event type trait specialization.
        template<typename TApi, bool TBlocking>
        struct EventType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            using type = EventUniformCudaHipRt<TApi>;
        };

        //! The CUDA/HIP RT queue enqueue trait specialization.
        template<typename TApi, bool TBlocking, typename TTask>
        struct Enqueue<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>, TTask>
        {
            using QueueImpl = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;

            struct HostFuncData
            {
                // We don't need to keep the queue alive, because in its dtor it will synchronize with the CUDA/HIP
                // stream and wait until all host functions and the CallbackThread are done. It's actually an error to
                // copy the queue into the host function. Destroying it here would call CUDA/HIP APIs from the host
                // function. Passing it further to the callback thread would make the CallbackThread hold a task
                // containing the queue with the CallbackThread itself. Destroying the task if no other queue instance
                // exists will make the CallbackThread join itself and crash.
                QueueImpl& q;
                TTask t;
            };

            static void uniformCudaHipRtHostFunc(void* arg)
            {
                auto data = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
                auto& queue = data->q;
                auto f = queue.m_callbackThread.submit([d = std::move(data)] { d->t(); });
                f.wait();
            }

            ALPAKA_FN_HOST static auto enqueue(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                TTask const& task) -> void
            {
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::launchHostFunc(
                    queue.getNativeHandle(),
                    uniformCudaHipRtHostFunc,
                    new HostFuncData{*queue.m_spQueueImpl, task}));
                if constexpr(TBlocking)
                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };

        //! The CUDA/HIP RT queue native handle trait specialization.
        template<typename TApi, bool TBlocking>
        struct NativeHandle<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            [[nodiscard]] static auto getNativeHandle(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
            {
                return queue.getNativeHandle();
            }
        };
    } // namespace trait
} // namespace alpaka

#endif
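
Usage note. The detail class above is normally reached through the public alpaka API rather than instantiated directly. The following is a minimal sketch (not part of this header) of how the queue is typically driven, assuming ALPAKA_ACC_GPU_CUDA_ENABLED and a recent alpaka release; the public names used here (alpaka.hpp, PlatformCudaRt, getDevByIdx, QueueCudaRtNonBlocking, enqueue, wait) come from the wider alpaka API, not from this file, and their exact spellings may differ between alpaka versions.

// Minimal usage sketch (not part of this header).
#include <alpaka/alpaka.hpp>

#include <iostream>

int main()
{
    // Pick the first CUDA device.
    auto const platform = alpaka::PlatformCudaRt{};
    auto const dev = alpaka::getDevByIdx(platform, 0);

    // QueueCudaRtNonBlocking is the public alias for the non-blocking variant of
    // uniform_cuda_hip::detail::QueueUniformCudaHipRt defined above.
    auto queue = alpaka::QueueCudaRtNonBlocking{dev};

    // A host callable goes through the Enqueue trait specialization in this file:
    // it is wrapped in a HostFuncData, handed to the runtime's host-function launch,
    // and executed in stream order on the queue's CallbackThread.
    alpaka::enqueue(queue, [] { std::cout << "host task done\n"; });

    // Maps to the CurrentThreadWaitFor specialization, i.e. a stream synchronize.
    alpaka::wait(queue);

    return 0;
}

Enqueuing a blocking queue (QueueCudaRtBlocking) takes the same path but additionally synchronizes the stream after each enqueue, as the if constexpr(TBlocking) branch above shows.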