alpaka
Abstraction Library for Parallel Kernel Acceleration
QueueUniformCudaHipRt.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
2  * Antonio Di Pilato
3  * SPDX-License-Identifier: MPL-2.0
4  */
5 
6 #pragma once
7 
9 #include "alpaka/core/Cuda.hpp"
10 #include "alpaka/core/Hip.hpp"
12 #include "alpaka/dev/Traits.hpp"
13 #include "alpaka/event/Traits.hpp"
15 #include "alpaka/queue/Traits.hpp"
16 #include "alpaka/traits/Traits.hpp"
17 #include "alpaka/wait/Traits.hpp"
18 
19 #include <condition_variable>
20 #include <functional>
21 #include <future>
22 #include <memory>
23 #include <mutex>
24 #include <thread>
25 
26 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
27 
28 namespace alpaka
29 {
30  template<typename TApi>
31  class EventUniformCudaHipRt;
32 
33  template<typename TApi>
34  class DevUniformCudaHipRt;
35 
36  namespace uniform_cuda_hip::detail
37  {
38  //! The CUDA/HIP RT queue implementation.
39  template<typename TApi>
41  {
42  public:
44  : m_dev(dev)
45  , m_UniformCudaHipQueue()
46  {
48 
49  // Set the current device.
50  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));
51 
52  // - [cuda/hip]StreamDefault: Default queue creation flag.
53  // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run
54  // concurrently with work in queue 0 (the NULL queue),
55  // and that the created queue should perform no implicit synchronization with queue 0.
56  // Create the queue on the current device.
57  // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantics implemented in the alpaka
58  // CPU queue. It would be too much work to implement implicit default queue synchronization on CPU.
59 
61  TApi::streamCreateWithFlags(&m_UniformCudaHipQueue, TApi::streamNonBlocking));
62  }
63 
66 
68  {
70 
71  // Make sure all pending async work is finished before destroying the stream to guarantee determinism.
72  // This would not be necessary for plain CUDA/HIP operations, but we can have host functions in the
73  // stream, which reference this queue instance and its CallbackThread. Make sure they are done.
74  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamSynchronize(m_UniformCudaHipQueue));
75  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamDestroy(m_UniformCudaHipQueue));
76  }
77 
//! \return A copy of the native stream handle stored in m_UniformCudaHipQueue (declared as TApi::Stream_t).
78  [[nodiscard]] auto getNativeHandle() const noexcept
79  {
80  return m_UniformCudaHipQueue;
81  }
82 
83  public:
84  DevUniformCudaHipRt<TApi> const m_dev; //!< The device this queue is bound to.
86 
87  private:
88  typename TApi::Stream_t m_UniformCudaHipQueue;
89  };
90 
91  //! The CUDA/HIP RT queue.
92  template<typename TApi, bool TBlocking>
94  : public interface::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRt<TApi, TBlocking>>
95  , public interface::Implements<ConceptQueue, QueueUniformCudaHipRt<TApi, TBlocking>>
96  , public interface::Implements<ConceptGetDev, QueueUniformCudaHipRt<TApi, TBlocking>>
97  {
98  public:
100  : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl<TApi>>(dev))
101  {
103  }
104 
//! Equality is identity of the shared implementation: two queue objects compare equal exactly when their
//! m_spQueueImpl shared pointers point to the same implementation object.
105  ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const& rhs) const -> bool
106  {
107  return (m_spQueueImpl == rhs.m_spQueueImpl);
108  }
109 
//! Inequality, defined as the negation of operator== (i.e. the two queues do not share one implementation object).
110  ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const& rhs) const -> bool
111  {
112  return !((*this) == rhs);
113  }
114 
//! \return The native handle obtained by forwarding to getNativeHandle() of the shared implementation object.
115  [[nodiscard]] auto getNativeHandle() const noexcept
116  {
117  return m_spQueueImpl->getNativeHandle();
118  }
119 
121  {
122  return m_spQueueImpl->m_callbackThread;
123  }
124 
125  public:
126  std::shared_ptr<QueueUniformCudaHipRtImpl<TApi>> m_spQueueImpl;
127  };
128  } // namespace uniform_cuda_hip::detail
129 
130  namespace trait
131  {
132  //! The CUDA/HIP RT queue device get trait specialization.
133  template<typename TApi, bool TBlocking>
134  struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
135  {
136  ALPAKA_FN_HOST static auto getDev(
139  {
140  return queue.m_spQueueImpl->m_dev;
141  }
142  };
143 
144  //! The CUDA/HIP RT queue emptiness test trait specialization.
145  template<typename TApi, bool TBlocking>
146  struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
147  {
148  ALPAKA_FN_HOST static auto empty(
150  {
152 
153  // Query is allowed even for queues on non current device.
154  typename TApi::Error_t ret = TApi::success;
156  ret = TApi::streamQuery(queue.getNativeHandle()),
157  TApi::errorNotReady);
158  return (ret == TApi::success);
159  }
160  };
161 
162  //! The CUDA/HIP RT queue thread wait trait specialization.
163  //!
164  //! Blocks execution of the calling thread until the queue has finished processing all previously requested
165  //! tasks (kernels, data copies, ...)
166  template<typename TApi, bool TBlocking>
167  struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
168  {
171  {
173 
174  // Sync is allowed even for queues on non current device.
175  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
176  }
177  };
178 
179  //! The CUDA/HIP RT queue device type trait specialization.
180  template<typename TApi, bool TBlocking>
181  struct DevType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
182  {
184  };
185 
186  //! The CUDA/HIP RT queue event type trait specialization.
187  template<typename TApi, bool TBlocking>
188  struct EventType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
189  {
191  };
192 
193  //! The CUDA/HIP RT queue enqueue trait specialization.
194  template<typename TApi, bool TBlocking, typename TTask>
195  struct Enqueue<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>, TTask>
196  {
198 
199  struct HostFuncData
200  {
201  // We don't need to keep the queue alive, because in its dtor it will synchronize with the CUDA/HIP
202  // stream and wait until all host functions and the CallbackThread are done. It's actually an error to
203  // copy the queue into the host function. Destroying it here would call CUDA/HIP APIs from the host
204  // function. Passing it further to the CallbackThread would make the CallbackThread hold a task
205  // containing the queue with the CallbackThread itself. Destroying the task if no other queue instance
206  // exists will make the CallbackThread join itself and crash.
208  TTask t;
209  };
210 
212  {
213  auto data = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
214  auto& queue = data->q;
215  auto f = queue.m_callbackThread.submit([d = std::move(data)] { d->t(); });
216  f.wait();
217  }
218 
219  ALPAKA_FN_HOST static auto enqueue(
221  TTask const& task) -> void
222  {
223  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::launchHostFunc(
224  queue.getNativeHandle(),
225  uniformCudaHipRtHostFunc,
226  new HostFuncData{*queue.m_spQueueImpl, task}));
227  if constexpr(TBlocking)
228  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
229  }
230  };
231 
232  //! The CUDA/HIP RT queue native handle trait specialization.
233  template<typename TApi, bool TBlocking>
234  struct NativeHandle<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
235  {
236  [[nodiscard]] static auto getNativeHandle(
238  {
239  return queue.getNativeHandle();
240  }
241  };
242  } // namespace trait
243 } // namespace alpaka
244 
245 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd,...)
CUDA/HIP runtime error checking with log and exception, ignoring specific error values.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
ALPAKA_FN_HOST auto registerQueue(std::shared_ptr< IDeviceQueue > spQueue) const -> void
Registers the given queue on this device. NOTE: Every queue has to be registered for correct function...
The CUDA/HIP RT device event.
auto operator=(QueueUniformCudaHipRtImpl &&) -> QueueUniformCudaHipRtImpl &=delete
QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl &&)=default
ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt< TApi > const &dev)
DevUniformCudaHipRt< TApi > const m_dev
The device this queue is bound to.
ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt< TApi > const &dev)
ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const &rhs) const -> bool
ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const &rhs) const -> bool
std::shared_ptr< QueueUniformCudaHipRtImpl< TApi > > m_spQueueImpl
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
constexpr ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC T arg(Complex< T > const &x)
Argument.
Definition: Complex.hpp:402
The alpaka accelerator library.
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition: Interface.hpp:15
static ALPAKA_FN_HOST auto currentThreadWaitFor(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> void
The thread wait trait.
Definition: Traits.hpp:21
The device type trait.
Definition: Traits.hpp:23
static ALPAKA_FN_HOST auto empty(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> bool
The queue empty trait.
Definition: Traits.hpp:31
static ALPAKA_FN_HOST auto enqueue(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > &queue, TTask const &task) -> void
The queue enqueue trait.
Definition: Traits.hpp:27
The event type trait.
Definition: Traits.hpp:17
static ALPAKA_FN_HOST auto getDev(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition: Traits.hpp:27
static auto getNativeHandle(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue)
The native handle trait.
Definition: Traits.hpp:17