alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
QueueCpuOmp2Collective.hpp
Go to the documentation of this file.
1/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
18
19#include <atomic>
20#include <mutex>
21
22#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
23
24# if _OPENMP < 200203
25# error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
26# endif
27
28# include <omp.h>
29
30namespace alpaka
31{
32 namespace cpu::detail
33 {
34# if BOOST_COMP_CLANG
35// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
36// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
37# pragma clang diagnostic push
38# pragma clang diagnostic ignored "-Wweak-vtables"
39# endif
40 //! The CPU collective device queue implementation.
//!
//! Holds the device, a mutex, a counter of currently executing tasks and the `blockingQueue` member that the
//! trait specializations for QueueCpuOmp2Collective operate on.
42# if BOOST_COMP_CLANG
43# pragma clang diagnostic pop
44# endif
45 {
//! Stores \p dev and initialises the `blockingQueue` member with the same device.
46 explicit QueueCpuOmp2CollectiveImpl(DevCpu const& dev) noexcept : m_dev(dev), blockingQueue(dev)
47 {
48 }
49
//! Forwards to alpaka::enqueue with this implementation object and the given event.
50 void enqueue(EventCpu& ev) final
51 {
52 alpaka::enqueue(*this, ev);
53 }
54
//! Forwards to alpaka::wait with this implementation object and the given event.
55 void wait(EventCpu const& ev) final
56 {
57 alpaka::wait(*this, ev);
58 }
59
61 {
// Spin (busy-wait, no sleeping or yielding) until empty() reports that `blockingQueue` has no pending work.
62 while(!empty(blockingQueue))
63 ;
64 }
65
66 DevCpu const m_dev; //!< The device this queue is bound to.
67 std::mutex mutable m_mutex; //!< Locked by the task enqueue and queue wait traits when called outside an OpenMP parallel region.
69 std::atomic<uint32_t> m_uCurrentlyExecutingTask = 0; //!< Incremented before and decremented after a task is run inside an OpenMP parallel region; read by the empty trait.
70 };
71 } // namespace cpu::detail
72
73 //! The CPU collective device queue.
74 //
75 // @attention Queue can only be used together with the accelerator AccCpuOmp2Blocks.
76 //
77 // This queue is an example for a user provided queue and the behavior is strongly coupled
78 // to the user workflows.
79 //
80 // Within an OpenMP parallel region, kernels will be executed collectively.
81 // All other operations will be performed by one thread (it is not defined which thread) and there will be no
82 // implicit synchronization between other operations within the OpenMP parallel region. Operations
83 // executed within an OpenMP parallel region will be executed after the tasks that were already queued before the
84 // parallel region was created.
85 //
86 // Outside of an OpenMP parallel region the queue behaves like QueueCpuBlocking.
87 struct QueueCpuOmp2Collective final : interface::Implements<ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
88 {
//! Creates the shared implementation object for the given device.
89 explicit QueueCpuOmp2Collective(DevCpu const& dev)
90 : m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev))
91 {
93 }
94
//! Two queue handles compare equal when they point to the same implementation object.
95 auto operator==(QueueCpuOmp2Collective const& rhs) const -> bool
96 {
97 return m_spQueueImpl == rhs.m_spQueueImpl;
98 }
99
//! Negation of operator==.
100 auto operator!=(QueueCpuOmp2Collective const& rhs) const -> bool
101 {
102 return !((*this) == rhs);
103 }
104
//! Shared-ownership pointer to the implementation object; the trait specializations access the queue state through it.
105 std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
106 };
107
108 namespace trait
109 {
110 //! The CPU blocking device queue device type trait specialization.
111 template<>
113 {
114 using type = DevCpu;
115 };
116
117 //! The CPU OpenMP2 collective device queue device get trait specialization.
118 template<>
120 {
122 {
123 return queue.m_spQueueImpl->m_dev;
124 }
125 };
126
127 //! The CPU blocking device queue event type trait specialization.
128 template<>
130 {
131 using type = EventCpu;
132 };
133
134 //! The CPU OpenMP2 collective device queue enqueue trait specialization for generic tasks.
135 //! Inside an OpenMP parallel region the function call operator of the task is invoked directly by one thread
//! of the team; outside of a parallel region the task is forwarded to the internal blocking queue.
136 template<typename TTask>
138 {
139 ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, TTask const& task) -> void
140 {
141 if(::omp_in_parallel() != 0)
142 {
143 // wait for all tasks enqueued before the parallel region
144 queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
// The counter is updated by every calling thread (the increment and decrement are outside the single
// construct), but only one thread of the team runs the task. `nowait` removes the implicit barrier at
// the end of the single construct, so the other threads continue without waiting for the task.
145 ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
146# pragma omp single nowait
147 task();
148 --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
149 }
150 else
151 {
// Outside a parallel region: hold the queue mutex while handing the task to the blocking queue.
152 std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
153 alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
154 }
155 }
156 };
157
158 //! The CPU OpenMP2 collective device queue empty trait specialization.
//!
//! The queue is reported as empty when the counter of currently executing tasks is zero and the internal
//! blocking queue is empty.
159 template<>
161 {
162 ALPAKA_FN_HOST static auto empty(QueueCpuOmp2Collective const& queue) -> bool
163 {
// Both conditions must hold: no task running inside a parallel region and nothing pending in blockingQueue.
164 return queue.m_spQueueImpl->m_uCurrentlyExecutingTask == 0u
165 && alpaka::empty(queue.m_spQueueImpl->blockingQueue);
166 }
167 };
168
169 //! The CPU OpenMP2 collective device queue enqueue trait specialization.
//! Specialization for enqueuing an EventCpu on the queue implementation object.
170 template<>
171 struct Enqueue<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
172 {
174 {
176
// All threads of the current team synchronize at this barrier.
177# pragma omp barrier
178 }
179 };
180
181 //! The CPU OpenMP2 collective device queue enqueue trait specialization.
//! Specialization for enqueuing an EventCpu on a QueueCpuOmp2Collective.
182 template<>
184 {
185 ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, EventCpu& event) -> void
186 {
188
189 if(::omp_in_parallel() != 0)
190 {
191 // wait for all tasks en-queued before the parallel region
192 queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
// Inside a parallel region the event itself is not enqueued; the threads of the team synchronize at a
// barrier instead.
193# pragma omp barrier
194 }
195 else
196 {
// Outside a parallel region the event is forwarded to the internal blocking queue.
// NOTE(review): unlike the task enqueue specializations, this branch does not lock m_mutex - confirm
// that this is intended.
197 alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, event);
198 }
199 }
200 };
201
202 //! The CPU OpenMP2 collective device queue enqueue trait specialization for OpenMP2 block kernel tasks.
203 //! Inside an OpenMP parallel region every calling thread invokes the function call operator of the task;
//! outside of a parallel region the task is forwarded to the internal blocking queue.
204 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
205 struct Enqueue<QueueCpuOmp2Collective, TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
206 {
210 {
211 if(::omp_in_parallel() != 0)
212 {
// wait for all tasks enqueued before the parallel region
213 queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
214 ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
215 // execute task within an OpenMP parallel region
// (no single construct here, in contrast to the generic task specialization)
216 task();
217 --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
218 }
219 else
220 {
// Outside a parallel region: hold the queue mutex while handing the task to the blocking queue.
221 std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
222 alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
223 }
224 }
225 };
226
227 template<>
228 struct Enqueue<QueueCpuOmp2Collective, test::EventHostManualTriggerCpu<>>
229 {
231 {
232 // EventHostManualTriggerCpu is not supported together with the queue
233 // QueueCpuOmp2Collective, but a specialization is needed to pass the EventTests
234 }
235 };
236
237 //! The CPU OpenMP2 collective device queue thread wait trait specialization.
238 //!
239 //! Blocks execution of the calling thread until the queue has finished processing all previously requested
240 //! tasks (kernels, data copies, ...)
241 template<>
243 {
245 {
246 if(::omp_in_parallel() != 0)
247 {
248 // wait for all tasks en-queued before the parallel region
249 queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
// then synchronize with the other threads of the team
250# pragma omp barrier
251 }
252 else
253 {
// Outside a parallel region: hold the queue mutex while waiting for the internal blocking queue.
254 std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
255 wait(queue.m_spQueueImpl->blockingQueue);
256 }
257 }
258 };
259
260 //! The CPU OpenMP2 collective device queue event wait trait specialization.
//! Specialization for the queue implementation object waiting for an EventCpu.
261 template<>
262 struct WaiterWaitFor<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
263 {
265 {
// The body only executes an OpenMP barrier at which the threads of the current team synchronize.
266# pragma omp barrier
267 }
268 };
269
270 //! The CPU OpenMP2 collective queue event wait trait specialization.
//!
//! Inside an OpenMP parallel region the event is not inspected; the whole queue is waited for instead.
//! Outside of a parallel region the internal blocking queue waits for the event.
271 template<>
273 {
274 ALPAKA_FN_HOST static auto waiterWaitFor(QueueCpuOmp2Collective& queue, EventCpu const& event) -> void
275 {
276 if(::omp_in_parallel() != 0)
277 {
278 // wait for all tasks en-queued before the parallel region
279 queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
// `event` is unused in this branch: waiting for the queue itself replaces waiting for the event
280 wait(queue);
281 }
282 else
283 wait(queue.m_spQueueImpl->blockingQueue, event);
284 }
285 };
286 } // namespace trait
287
288 //! The blocking queue trait specialization for an OpenMP2 collective CPU queue.
289 template<>
291 {
292 };
293} // namespace alpaka
294
296
297#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
The CPU device handle.
Definition DevCpu.hpp:56
ALPAKA_FN_HOST auto registerQueue(std::shared_ptr< cpu::ICpuQueue > spQueue) const -> void
Registers the given queue on this device. NOTE: Every queue has to be registered for correct function...
Definition DevCpu.hpp:82
The CPU OpenMP 2.0 block accelerator execution task.
Event that can be enqueued into a queue and can be triggered by the Host.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The alpaka accelerator library.
ALPAKA_FN_HOST auto empty(TQueue const &queue) -> bool
Tests if the queue is empty (all ops in the given queue have been completed).
Definition Traits.hpp:58
ALPAKA_FN_HOST auto enqueue(TQueue &queue, TTask &&task) -> void
Queues the given task in the given queue.
Definition Traits.hpp:47
EventGenericThreads< DevCpu > EventCpu
Definition EventCpu.hpp:12
ALPAKA_FN_HOST auto wait(TAwaited const &awaited) -> void
Blocks the calling thread until the given awaited action has completed.
Definition Traits.hpp:34
STL namespace.
The CPU collective device queue.
std::shared_ptr< cpu::detail::QueueCpuOmp2CollectiveImpl > m_spQueueImpl
auto operator==(QueueCpuOmp2Collective const &rhs) const -> bool
auto operator!=(QueueCpuOmp2Collective const &rhs) const -> bool
The CPU collective device queue implementation.
DevCpu const m_dev
The device this queue is bound to.
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition Interface.hpp:15
The blocking queue trait.
Definition Queue.hpp:58
static ALPAKA_FN_HOST auto currentThreadWaitFor(QueueCpuOmp2Collective const &queue) -> void
The thread wait trait.
Definition Traits.hpp:21
The device type trait.
Definition Traits.hpp:23
static ALPAKA_FN_HOST auto empty(QueueCpuOmp2Collective const &queue) -> bool
The queue empty trait.
Definition Traits.hpp:31
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &queue, EventCpu &event) -> void
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &queue, TTask const &task) -> void
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &queue, TaskKernelCpuOmp2Blocks< TDim, TIdx, TKernelFnObj, TArgs... > const &task) -> void
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &, test::EventHostManualTriggerCpu<> &) -> void
static ALPAKA_FN_HOST auto enqueue(cpu::detail::QueueCpuOmp2CollectiveImpl &, EventCpu &) -> void
The queue enqueue trait.
Definition Traits.hpp:27
The event type trait.
Definition Traits.hpp:17
static ALPAKA_FN_HOST auto getDev(QueueCpuOmp2Collective const &queue) -> DevCpu
The device get trait.
Definition Traits.hpp:27
static ALPAKA_FN_HOST auto waiterWaitFor(QueueCpuOmp2Collective &queue, EventCpu const &event) -> void
static ALPAKA_FN_HOST auto waiterWaitFor(cpu::detail::QueueCpuOmp2CollectiveImpl &, EventCpu const &) -> void
The waiter wait trait.
Definition Traits.hpp:25