alpaka
Abstraction Library for Parallel Kernel Acceleration
QueueCpuOmp2Collective.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 #include "alpaka/dev/DevCpu.hpp"
8 #include "alpaka/dev/Traits.hpp"
10 #include "alpaka/event/Traits.hpp"
13 #include "alpaka/queue/Traits.hpp"
17 #include "alpaka/wait/Traits.hpp"
18 
19 #include <atomic>
20 #include <mutex>
21 
22 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
23 
24 # if _OPENMP < 200203
25 # error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
26 # endif
27 
28 # include <omp.h>
29 
30 namespace alpaka
31 {
32  namespace cpu::detail
33  {
34 # if BOOST_COMP_CLANG
35 // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
36 // translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
37 # pragma clang diagnostic push
38 # pragma clang diagnostic ignored "-Wweak-vtables"
39 # endif
40  //! The CPU collective device queue implementation.
42 # if BOOST_COMP_CLANG
43 # pragma clang diagnostic pop
44 # endif
45  {
46  explicit QueueCpuOmp2CollectiveImpl(DevCpu const& dev) noexcept : m_dev(dev), blockingQueue(dev)
47  {
48  }
49 
50  void enqueue(EventCpu& ev) final
51  {
52  alpaka::enqueue(*this, ev);
53  }
54 
55  void wait(EventCpu const& ev) final
56  {
57  alpaka::wait(*this, ev);
58  }
59 
61  {
62  while(!empty(blockingQueue))
63  ;
64  }
65 
66  DevCpu const m_dev; //!< The device this queue is bound to.
67  std::mutex mutable m_mutex;
69  std::atomic<uint32_t> m_uCurrentlyExecutingTask = 0;
70  };
71  } // namespace cpu::detail
72 
73  //! The CPU collective device queue.
74  //
75  // @attention Queue can only be used together with the accelerator AccCpuOmp2Blocks.
76  //
77  // This queue is an example for a user provided queue and the behavior is strongly coupled
78  // to the user workflows.
79  //
80  // Within an OpenMP parallel region, kernels are executed collectively.
81  // All other operations are performed by a single thread (it is not defined which one), and there is no
82  // implicit synchronization between these operations within the OpenMP parallel region. Operations
83  // enqueued within an OpenMP parallel region are executed after all tasks that were enqueued before the
84  // parallel region was created.
85  //
86  // Outside of an OpenMP parallel region the queue behaves like QueueCpuBlocking.
87  struct QueueCpuOmp2Collective final : interface::Implements<ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
88  {
89  explicit QueueCpuOmp2Collective(DevCpu const& dev)
90  : m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev))
91  {
93  }
94 
95  auto operator==(QueueCpuOmp2Collective const& rhs) const -> bool
96  {
97  return m_spQueueImpl == rhs.m_spQueueImpl;
98  }
99 
100  auto operator!=(QueueCpuOmp2Collective const& rhs) const -> bool
101  {
102  return !((*this) == rhs);
103  }
104 
105  std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
106  };
107 
108  namespace trait
109  {
110  //! The CPU OpenMP2 collective device queue device type trait specialization.
111  template<>
113  {
114  using type = DevCpu;
115  };
116 
117  //! The CPU OpenMP2 collective device queue device get trait specialization.
118  template<>
120  {
121  ALPAKA_FN_HOST static auto getDev(QueueCpuOmp2Collective const& queue) -> DevCpu
122  {
123  return queue.m_spQueueImpl->m_dev;
124  }
125  };
126 
127  //! The CPU OpenMP2 collective device queue event type trait specialization.
128  template<>
130  {
131  using type = EventCpu;
132  };
133 
134  //! The CPU OpenMP2 collective device queue enqueue trait specialization.
135  //! Inside an OpenMP parallel region the task is invoked directly by a single thread; otherwise it is forwarded to the internal blocking queue.
136  template<typename TTask>
138  {
139  ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, TTask const& task) -> void
140  {
141  if(::omp_in_parallel() != 0)
142  {
143  // wait for all tasks enqueued before the parallel region
144  queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
145  ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
146 # pragma omp single nowait
147  task();
148  --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
149  }
150  else
151  {
152  std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
153  alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
154  }
155  }
156  };
157 
158  //! The CPU OpenMP2 collective device queue test trait specialization.
159  template<>
161  {
162  ALPAKA_FN_HOST static auto empty(QueueCpuOmp2Collective const& queue) -> bool
163  {
164  return queue.m_spQueueImpl->m_uCurrentlyExecutingTask == 0u
165  && alpaka::empty(queue.m_spQueueImpl->blockingQueue);
166  }
167  };
168 
169  //! The CPU OpenMP2 collective device queue enqueue trait specialization.
170  template<>
171  struct Enqueue<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
172  {
174  {
176 
177 # pragma omp barrier
178  }
179  };
180 
181  //! The CPU OpenMP2 collective device queue enqueue trait specialization.
182  template<>
184  {
185  ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, EventCpu& event) -> void
186  {
188 
189  if(::omp_in_parallel() != 0)
190  {
191  // wait for all tasks en-queued before the parallel region
192  queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
193 # pragma omp barrier
194  }
195  else
196  {
197  alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, event);
198  }
199  }
200  };
201 
202  //! The CPU OpenMP2 collective device queue kernel enqueue trait specialization.
203  //! Inside an OpenMP parallel region the kernel task is invoked directly by every calling thread; otherwise it is forwarded to the internal blocking queue.
204  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
205  struct Enqueue<QueueCpuOmp2Collective, TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
206  {
207  ALPAKA_FN_HOST static auto enqueue(
208  QueueCpuOmp2Collective& queue,
210  {
211  if(::omp_in_parallel() != 0)
212  {
213  queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
214  ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
215  // execute task within an OpenMP parallel region
216  task();
217  --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
218  }
219  else
220  {
221  std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
222  alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
223  }
224  }
225  };
226 
227  template<>
228  struct Enqueue<QueueCpuOmp2Collective, test::EventHostManualTriggerCpu<>>
229  {
231  {
232  // EventHostManualTriggerCpu is not supported together with the queue
233  // QueueCpuOmp2Collective, but a specialization is needed to pass the EventTests.
234  }
235  };
236 
237  //! The CPU OpenMP2 collective device queue thread wait trait specialization.
238  //!
239  //! Blocks execution of the calling thread until the queue has finished processing all previously requested
240  //! tasks (kernels, data copies, ...)
241  template<>
243  {
245  {
246  if(::omp_in_parallel() != 0)
247  {
248  // wait for all tasks en-queued before the parallel region
249  queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
250 # pragma omp barrier
251  }
252  else
253  {
254  std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
255  wait(queue.m_spQueueImpl->blockingQueue);
256  }
257  }
258  };
259 
260  //! The CPU OpenMP2 collective device queue event wait trait specialization.
261  template<>
262  struct WaiterWaitFor<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
263  {
265  {
266 # pragma omp barrier
267  }
268  };
269 
270  //! The CPU OpenMP2 collective queue event wait trait specialization.
271  template<>
273  {
274  ALPAKA_FN_HOST static auto waiterWaitFor(QueueCpuOmp2Collective& queue, EventCpu const& event) -> void
275  {
276  if(::omp_in_parallel() != 0)
277  {
278  // wait for all tasks en-queued before the parallel region
279  queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
280  wait(queue);
281  }
282  else
283  wait(queue.m_spQueueImpl->blockingQueue, event);
284  }
285  };
286  } // namespace trait
287 
288  //! The blocking queue trait specialization for an OpenMP2 collective CPU queue.
289  template<>
291  {
292  };
293 } // namespace alpaka
294 
295 # include "alpaka/event/EventCpu.hpp"
296 
297 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU device handle.
Definition: DevCpu.hpp:56
ALPAKA_FN_HOST auto registerQueue(std::shared_ptr< cpu::ICpuQueue > spQueue) const -> void
Registers the given queue on this device. NOTE: Every queue has to be registered for correct function...
Definition: DevCpu.hpp:82
The CPU OpenMP 2.0 block accelerator execution task.
Event that can be enqueued into a queue and can be triggered by the Host.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
The alpaka accelerator library.
ALPAKA_FN_HOST auto empty(TQueue const &queue) -> bool
Tests if the queue is empty (all ops in the given queue have been completed).
Definition: Traits.hpp:58
ALPAKA_FN_HOST auto enqueue(TQueue &queue, TTask &&task) -> void
Queues the given task in the given queue.
Definition: Traits.hpp:47
EventGenericThreads< DevCpu > EventCpu
Definition: EventCpu.hpp:12
ALPAKA_FN_HOST auto wait(TAwaited const &awaited) -> void
Blocks the calling thread until the given awaited action has completed.
Definition: Traits.hpp:34
The CPU collective device queue.
std::shared_ptr< cpu::detail::QueueCpuOmp2CollectiveImpl > m_spQueueImpl
auto operator==(QueueCpuOmp2Collective const &rhs) const -> bool
auto operator!=(QueueCpuOmp2Collective const &rhs) const -> bool
The CPU collective device queue implementation.
DevCpu const m_dev
The device this queue is bound to.
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition: Interface.hpp:15
The blocking queue trait.
Definition: Queue.hpp:58
static ALPAKA_FN_HOST auto currentThreadWaitFor(QueueCpuOmp2Collective const &queue) -> void
The thread wait trait.
Definition: Traits.hpp:21
The device type trait.
Definition: Traits.hpp:23
static ALPAKA_FN_HOST auto empty(QueueCpuOmp2Collective const &queue) -> bool
The queue empty trait.
Definition: Traits.hpp:31
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &queue, EventCpu &event) -> void
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &queue, TTask const &task) -> void
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &queue, TaskKernelCpuOmp2Blocks< TDim, TIdx, TKernelFnObj, TArgs... > const &task) -> void
static ALPAKA_FN_HOST auto enqueue(QueueCpuOmp2Collective &, test::EventHostManualTriggerCpu<> &) -> void
static ALPAKA_FN_HOST auto enqueue(cpu::detail::QueueCpuOmp2CollectiveImpl &, EventCpu &) -> void
The queue enqueue trait.
Definition: Traits.hpp:27
The event type trait.
Definition: Traits.hpp:17
static ALPAKA_FN_HOST auto getDev(QueueCpuOmp2Collective const &queue) -> DevCpu
The device get trait.
Definition: Traits.hpp:27
static ALPAKA_FN_HOST auto waiterWaitFor(QueueCpuOmp2Collective &queue, EventCpu const &event) -> void
static ALPAKA_FN_HOST auto waiterWaitFor(cpu::detail::QueueCpuOmp2CollectiveImpl &, EventCpu const &) -> void
The waiter wait trait.
Definition: Traits.hpp:25