alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelCpuOmp2Threads.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 // Specialized traits.
8 #include "alpaka/acc/Traits.hpp"
9 #include "alpaka/dev/Traits.hpp"
10 #include "alpaka/dim/Traits.hpp"
11 #include "alpaka/idx/Traits.hpp"
13 
14 // Implementation details.
16 #include "alpaka/core/Decay.hpp"
17 #include "alpaka/dev/DevCpu.hpp"
19 #include "alpaka/kernel/Traits.hpp"
20 #include "alpaka/meta/NdLoop.hpp"
23 
24 #include <functional>
25 #include <stdexcept>
26 #include <tuple>
27 #include <type_traits>
28 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
29 # include <iostream>
30 #endif
31 
32 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
33 
34 # if _OPENMP < 200203
35 # error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
36 # endif
37 
38 # include <omp.h>
39 
40 namespace alpaka
41 {
42  //! The CPU OpenMP 2.0 thread accelerator execution task.
43  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
44  class TaskKernelCpuOmp2Threads final : public WorkDivMembers<TDim, TIdx>
45  {
46  public:
47  template<typename TWorkDiv>
48  ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
49  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
50  , m_kernelFnObj(kernelFnObj)
51  , m_args(std::forward<TArgs>(args)...)
52  {
53  static_assert(
54  Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
55  "The work division and the execution task have to be of the same dimensionality!");
56  }
57 
58  //! Executes the kernel function object.
59  ALPAKA_FN_HOST auto operator()() const -> void
60  {
62 
63  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
64  auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
65  auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
66 
67  // Get the size of the block shared dynamic memory.
68  auto const blockSharedMemDynSizeBytes = std::apply(
69  [&](std::decay_t<TArgs> const&... args)
70  {
71  return getBlockSharedMemDynSizeBytes<AccCpuOmp2Threads<TDim, TIdx>>(
72  m_kernelFnObj,
73  blockThreadExtent,
74  threadElemExtent,
75  args...);
76  },
77  m_args);
78 
79 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
80  std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
81  << std::endl;
82 # endif
83 
85  *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
86  blockSharedMemDynSizeBytes);
87 
88  // The number of threads in this block.
89  TIdx const blockThreadCount(blockThreadExtent.prod());
90  [[maybe_unused]] int const iBlockThreadCount(static_cast<int>(blockThreadCount));
91 
92  if(::omp_in_parallel() != 0)
93  {
94  throw std::runtime_error(
95  "The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
96  }
97 
98  // Force the environment to use the given number of threads.
99  int const ompIsDynamic(::omp_get_dynamic());
100  ::omp_set_dynamic(0);
101 
102  // Execute the blocks serially.
104  gridBlockExtent,
105  [&](Vec<TDim, TIdx> const& gridBlockIdx)
106  {
107  acc.m_gridBlockIdx = gridBlockIdx;
108 
109 // Execute the threads in parallel.
110 
111 // Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to
112 // be done with their work up to this line. So we have to spawn one OS thread per thread in a block. 'omp for' is not
113 // useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1
114 // mapping is required. Therefore we use 'omp parallel' with the specified number of threads in a block.
115 # pragma omp parallel num_threads(iBlockThreadCount)
116  {
117  // The guard is for gcc internal compiler error, as discussed in #735
118  if constexpr((!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0)))
119  {
120 # pragma omp single nowait
121  {
122  // The OpenMP runtime does not create a parallel region when only one thread is
123  // required in the num_threads clause. In all other cases we expect to be in a parallel
124  // region now.
125  if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
126  {
127  throw std::runtime_error(
128  "The OpenMP 2.0 runtime did not create a parallel region!");
129  }
130 
131  int const numThreads = ::omp_get_num_threads();
132  if(numThreads != iBlockThreadCount)
133  {
134  throw std::runtime_error(
135  "The OpenMP 2.0 runtime did not use the number of threads "
136  "that had been required!");
137  }
138  }
139  }
140 
141  std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
142 
143  // Wait for all threads to finish before deleting the shared memory.
144  // This is done by default if the omp 'nowait' clause is missing on the omp parallel directive
145  // syncBlockThreads(acc);
146  }
147 
148  // After a block has been processed, the shared memory has to be deleted.
149  freeSharedVars(acc);
150  });
151 
152  // Reset the dynamic thread number setting.
153  ::omp_set_dynamic(ompIsDynamic);
154  }
155 
156  private:
157  TKernelFnObj m_kernelFnObj;
158  std::tuple<std::decay_t<TArgs>...> m_args;
159  };
160 
161  namespace trait
162  {
163  //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
     //! Maps the task type to the CPU OpenMP 2.0 thread accelerator with the same dimensionality and idx type.
164  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
165  struct AccType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
166  {
167  using type = AccCpuOmp2Threads<TDim, TIdx>;
168  };
169 
170  //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
     //! Reports DevCpu as the device type of the task.
171  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
172  struct DevType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
173  {
174  using type = DevCpu;
175  };
176 
177  //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
     //! Reports the task's TDim template argument as its dimensionality.
178  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
179  struct DimType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
180  {
181  using type = TDim;
182  };
183 
184  //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
     //! Reports PlatformCpu as the platform type of the task.
185  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
186  struct PlatformType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
187  {
188  using type = PlatformCpu;
189  };
190 
191  //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
     //! Reports the task's TIdx template argument as its idx type.
192  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
193  struct IdxType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
194  {
195  using type = TIdx;
196  };
197 
198  //! \brief Specialisation of the class template FunctionAttributes for the CPU OpenMP 2.0 thread accelerator.
199  //! \tparam TDev The device type.
200  //! \tparam TDim The dimensionality of the accelerator device properties.
201  //! \tparam TIdx The idx type of the accelerator device properties.
202  //! \tparam TKernelFn Kernel function object type.
203  //! \tparam TArgs Kernel function object argument types as a parameter pack.
204  template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
205  struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
206  {
207  //! \param dev The device instance
208  //! \param kernelFn The kernel function object which should be executed (not used by this specialisation).
209  //! \param args The kernel invocation arguments (not used by this specialisation).
210  //! \return KernelFunctionAttributes instance. maxThreadsPerBlock is set to the m_blockThreadCountMax device
211  //! property of this accelerator and maxDynamicSharedSizeBytes to BlockSharedDynMemberAllocKiB * 1024; all
     //! other fields keep the values of a default-constructed KernelFunctionAttributes.
212  ALPAKA_FN_HOST static auto getFunctionAttributes(
213  TDev const& dev,
214  [[maybe_unused]] TKernelFn const& kernelFn,
215  [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
216  {
217  alpaka::KernelFunctionAttributes kernelFunctionAttributes;
218
219  // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
220  // properties function.
221  auto const& props = alpaka::getAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>(dev);
222  kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
223  kernelFunctionAttributes.maxDynamicSharedSizeBytes
224  = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
225  return kernelFunctionAttributes;
226  }
227  };
228 
229  } // namespace trait
230 } // namespace alpaka
231 
232 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU OpenMP 2.0 thread accelerator.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU OpenMP 2.0 thread accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const &extent, TFnObj const &f) -> void
Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration....
Definition: NdLoop.hpp:81
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition: Traits.hpp:54
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
The CPU device platform.
Definition: PlatformCpu.hpp:18
The accelerator type trait.
Definition: Traits.hpp:37
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, [[maybe_unused]] TKernelFn const &kernelFn, [[maybe_unused]] TArgs &&... args) -> alpaka::KernelFunctionAttributes
The structure template to access to the functions attributes of a kernel function object.
Definition: Traits.hpp:79
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30