alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelCpuOmp2Threads.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 // Specialized traits.
8 #include "alpaka/acc/Traits.hpp"
9 #include "alpaka/dev/Traits.hpp"
10 #include "alpaka/dim/Traits.hpp"
11 #include "alpaka/idx/Traits.hpp"
13 
14 // Implementation details.
16 #include "alpaka/core/Decay.hpp"
17 #include "alpaka/dev/DevCpu.hpp"
18 #include "alpaka/kernel/Traits.hpp"
19 #include "alpaka/meta/NdLoop.hpp"
21 
22 #include <functional>
23 #include <stdexcept>
24 #include <tuple>
25 #include <type_traits>
26 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
27 # include <iostream>
28 #endif
29 
30 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
31 
32 # if _OPENMP < 200203
33 # error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
34 # endif
35 
36 # include <omp.h>
37 
38 namespace alpaka
39 {
40  //! The CPU OpenMP 2.0 thread accelerator execution task.
    //!
    //! The task derives from WorkDivMembers to hold the work division and additionally stores the kernel
    //! function object and the kernel arguments by value. Calling the task processes the grid blocks one
    //! after another; the threads of a block are run by an `omp parallel` region that requests one OpenMP
    //! thread per block thread.
    //!
    //! \tparam TDim The dimensionality of the work division.
    //! \tparam TIdx The index type of the work division.
    //! \tparam TKernelFnObj The type of the kernel function object.
    //! \tparam TArgs The types of the kernel arguments.
41  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
42  class TaskKernelCpuOmp2Threads final : public WorkDivMembers<TDim, TIdx>
43  {
44  public:
    //! Creates the task from a work division, a kernel function object and the kernel arguments.
    //!
    //! \tparam TWorkDiv The type of the work division; its dimensionality has to equal TDim (checked at
    //! compile time).
    //! \param workDiv The work division, forwarded to the WorkDivMembers base class.
    //! \param kernelFnObj The kernel function object; a copy is stored in the task.
    //! \param args The kernel arguments; they are forwarded into a tuple of decayed types.
45  template<typename TWorkDiv>
46  ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
47  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
48  , m_kernelFnObj(kernelFnObj)
49  , m_args(std::forward<TArgs>(args)...)
50  {
51  static_assert(
52  Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
53  "The work division and the execution task have to be of the same dimensionality!");
54  }
55 
56  //! Executes the kernel function object.
    //!
    //! The grid blocks are processed one after another. For every block an `omp parallel` region with
    //! `num_threads(<threads per block>)` is opened and every OpenMP thread invokes the kernel once.
    //! OpenMP dynamic thread adjustment is switched off for the duration of the call and set back to its
    //! previous value at the end of the function.
    //!
    //! \throws std::runtime_error if called from within an existing OpenMP parallel region. The same
    //! exception type is thrown from inside the parallel region if the runtime did not create a parallel
    //! region or did not provide the requested number of threads.
57  ALPAKA_FN_HOST auto operator()() const -> void
58  {
    // NOTE(review): the listing numbering jumps from 58 to 60, so one line of the original header is not
    // shown in this rendering - confirm against the header whether a statement is missing here.
60 
    // Read the three extents of the work division held by the base class.
61  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
62  auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
63  auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
64 
65  // Get the size of the block shared dynamic memory.
    // The stored arguments are unpacked from the tuple and handed over as const references.
66  auto const blockSharedMemDynSizeBytes = std::apply(
67  [&](std::decay_t<TArgs> const&... args)
68  {
69  return getBlockSharedMemDynSizeBytes<AccCpuOmp2Threads<TDim, TIdx>>(
70  m_kernelFnObj,
71  blockThreadExtent,
72  threadElemExtent,
73  args...);
74  },
75  m_args);
76 
    // With full debugging enabled, report the dynamic shared memory size on stdout.
77 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
78  std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
79  << std::endl;
80 # endif
81 
    // NOTE(review): listing line 82 is not shown in this rendering. The two lines below are the argument
    // list of an expression whose head is missing; presumably it constructs the accelerator object `acc`
    // that is used further down - confirm against the header.
83  *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
84  blockSharedMemDynSizeBytes);
85 
86  // The number of threads in this block.
87  TIdx const blockThreadCount(blockThreadExtent.prod());
    // The int copy is the value passed to the num_threads clause and compared with the actual team size below.
88  [[maybe_unused]] int const iBlockThreadCount(static_cast<int>(blockThreadCount));
89 
    // Refuse to run when the caller is already inside a parallel region.
90  if(::omp_in_parallel() != 0)
91  {
92  throw std::runtime_error(
93  "The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
94  }
95 
96  // Force the environment to use the given number of threads.
    // The previous setting is remembered so that it can be restored at the end of the function.
97  int const ompIsDynamic(::omp_get_dynamic());
98  ::omp_set_dynamic(0);
99 
100  // Execute the blocks serially.
     // NOTE(review): listing line 101 is not shown in this rendering. The lines below are the arguments
     // (an extent and a per-index callback) of a call whose head is missing - presumably the n-dimensional
     // index loop from alpaka/meta/NdLoop.hpp; confirm against the header.
102  gridBlockExtent,
103  [&](Vec<TDim, TIdx> const& gridBlockIdx)
104  {
     // Record in the accelerator which block is being processed.
105  acc.m_gridBlockIdx = gridBlockIdx;
106 
107 // Execute the threads in parallel.
108 
109 // Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to
110 // be done with their work up to this line. So we have to spawn one OS thread per thread in a block. 'omp for' is not
111 // useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1
112 // mapping is required. Therefore we use 'omp parallel' with the specified number of threads in a block.
113 # pragma omp parallel num_threads(iBlockThreadCount)
114  {
115  // The sanity checks below are compiled out for GCC versions older than 8.1.0 because of an
     // internal compiler error, as discussed in #735.
116  if constexpr((!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0)))
117  {
     // Only one thread performs the checks; 'nowait' lets the other threads continue without waiting for it.
118 # pragma omp single nowait
119  {
120  // The OpenMP runtime does not create a parallel region when only one thread is
121  // required in the num_threads clause. In all other cases we expect to be in a parallel
122  // region now.
123  if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
124  {
125  throw std::runtime_error(
126  "The OpenMP 2.0 runtime did not create a parallel region!");
127  }
128 
     // The team has to consist of exactly one OpenMP thread per block thread.
129  int const numThreads = ::omp_get_num_threads();
130  if(numThreads != iBlockThreadCount)
131  {
132  throw std::runtime_error(
133  "The OpenMP 2.0 runtime did not use the number of threads "
134  "that had been required!");
135  }
136  }
137  }
138 
     // Invoke the kernel: the accelerator is passed by reference as the first argument, followed by the
     // stored kernel arguments (std::tuple_cat copies them into the tuple built for this call).
139  std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
140 
141  // Wait for all threads to finish before deleting the shared memory.
142  // No explicit barrier is needed for that: the end of the omp parallel region is an implicit barrier.
143  // syncBlockThreads(acc);
144  }
145 
146  // After a block has been processed, the shared memory has to be deleted.
147  freeSharedVars(acc);
148  });
149 
150  // Reset the dynamic thread number setting.
     // NOTE(review): this statement is skipped if an exception propagates out of the block loop above,
     // which would leave dynamic adjustment disabled - consider whether a scope guard is wanted.
151  ::omp_set_dynamic(ompIsDynamic);
152  }
153 
154  private:
     //! The kernel function object, stored by value.
155  TKernelFnObj m_kernelFnObj;
     //! The kernel arguments, stored by value with decayed types.
156  std::tuple<std::decay_t<TArgs>...> m_args;
157  };
158 
159  namespace trait
160  {
161  //! The accelerator type trait specialization for the CPU OpenMP 2.0 block thread execution task.
     //!
     //! NOTE(review): listing line 165 is not shown in this rendering, so the body appears empty here;
     //! presumably the original header declares the `type` alias on that line - confirm against the header.
162  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
163  struct AccType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
164  {
166  };
167 
168  //! The device type trait specialization for the CPU OpenMP 2.0 block thread execution task.
     //!
     //! Reports DevCpu as the device type of the task, independent of the template arguments.
169  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
170  struct DevType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
171  {
172  using type = DevCpu;
173  };
174 
175  //! The dimension getter trait specialization for the CPU OpenMP 2.0 block thread execution task.
     //!
     //! Reports the task's TDim template argument as its dimension type.
176  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
177  struct DimType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
178  {
179  using type = TDim;
180  };
181 
182  //! The platform type trait specialization for the CPU OpenMP 2.0 block thread execution task.
     //!
     //! Reports PlatformCpu as the platform type of the task, independent of the template arguments.
183  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
184  struct PlatformType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
185  {
186  using type = PlatformCpu;
187  };
188 
189  //! The idx type trait specialization for the CPU OpenMP 2.0 block thread execution task.
     //!
     //! Reports the task's TIdx template argument as its index type.
190  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
191  struct IdxType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
192  {
193  using type = TIdx;
194  };
195  } // namespace trait
196 } // namespace alpaka
197 
198 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU OpenMP 2.0 thread accelerator.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU OpenMP 2.0 thread accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const &extent, TFnObj const &f) -> void
Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration....
Definition: NdLoop.hpp:81
The alpaka accelerator library.
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition: Traits.hpp:54
The CPU device platform.
Definition: PlatformCpu.hpp:18
The accelerator type trait.
Definition: Traits.hpp:37
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30