alpaka
Abstraction Library for Parallel Kernel Acceleration
AccCpuOmp2Threads.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 // Base classes.
24 
25 // Specialized traits.
26 #include "alpaka/acc/Traits.hpp"
27 #include "alpaka/dev/Traits.hpp"
28 #include "alpaka/idx/Traits.hpp"
29 #include "alpaka/kernel/Traits.hpp"
31 
32 // Implementation details.
33 #include "alpaka/acc/Tag.hpp"
34 #include "alpaka/core/ClipCast.hpp"
35 #include "alpaka/core/Concepts.hpp"
36 #include "alpaka/dev/DevCpu.hpp"
37 
38 #include <limits>
39 #include <typeinfo>
40 
41 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
42 
43 # if _OPENMP < 200203
44 # error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
45 # endif
46 
47 # include <omp.h>
48 
49 namespace alpaka
50 {
51  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
52  class TaskKernelCpuOmp2Threads; // Forward declaration only: the type is named by the friend declaration in AccCpuOmp2Threads and by trait::CreateTaskKernel further down.
53 
54  //! The CPU OpenMP 2.0 thread accelerator.
55  //!
56  //! This accelerator allows parallel kernel execution on a CPU device.
57  //! It uses OpenMP 2.0 to implement the block thread parallelism. What it offers to kernels (work division, grid/block indices, atomics, math, shared memory, block synchronization, intrinsics, memory fences, random numbers, warp emulation) is composed from the base classes below. The constructor is private, so objects are created only from within the befriended TaskKernelCpuOmp2Threads.
58  template<typename TDim, typename TIdx>
59  class AccCpuOmp2Threads final
60  : public WorkDivMembers<TDim, TIdx>
61  , public gb::IdxGbRef<TDim, TIdx>
62  , public bt::IdxBtOmp<TDim, TIdx>
63  , public AtomicHierarchy<
64  AtomicCpu, // grid atomics
65  AtomicOmpBuiltIn, // block atomics
66  AtomicOmpBuiltIn> // thread atomics
67  , public math::MathStdLib
68  , public BlockSharedMemDynMember<>
70  , public BlockSyncBarrierOmp
71  , public IntrinsicCpu
72  , public MemFenceOmp2Threads
73 # ifdef ALPAKA_DISABLE_VENDOR_RNG
74  , public rand::RandDefault
75 # else
76  , public rand::RandStdLib
77 # endif
78  , public warp::WarpSingleThread
79  , public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
80  {
81  static_assert( // Compile-time guard on the index type width; presumably needed because the OpenMP thread numbers used here are plain int - TODO confirm.
82  sizeof(TIdx) >= sizeof(int),
83  "Index type is not supported, consider using int or a larger type.");
84 
85  public:
86  // A friend declaration cannot be a partial specialization, so every TaskKernelCpuOmp2Threads instantiation is befriended (TDim2/TIdx2) instead of only the one matching this class' TDim and TIdx.
87  template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
88  friend class ::alpaka::TaskKernelCpuOmp2Threads;
89 
92  auto operator=(AccCpuOmp2Threads const&) -> AccCpuOmp2Threads& = delete;
94 
95  private: // The constructor below is reachable only by this class and its friend.
96  template<typename TWorkDiv>
97  ALPAKA_FN_HOST AccCpuOmp2Threads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
98  : WorkDivMembers<TDim, TIdx>(workDiv)
99  , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx) // Only a reference to the member is handed over here; the member itself is initialized after all base classes.
100  , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
102  staticMemBegin(),
104  [this]() { syncBlockThreads(*this); }, // Callback that synchronizes all threads of the current block.
105  []() noexcept { return (::omp_get_thread_num() == 0); }) // Predicate that is true only on OpenMP thread 0 (presumably the block's master thread - confirm against the receiving base class).
106  , m_gridBlockIdx(Vec<TDim, TIdx>::zeros()) // Starts at index zero in every dimension.
107  {
108  }
109 
110  private:
111  // getIdx: storage behind the gb::IdxGbRef base, which receives a reference to this member in the constructor.
112  Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
113  };
114 
115  namespace trait
116  {
117  //! The CPU OpenMP 2.0 thread accelerator type trait specialization.
118  template<typename TDim, typename TIdx>
119  struct AccType<AccCpuOmp2Threads<TDim, TIdx>>
120  {
122  };
123 
124  //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization. The block thread limit comes from ::omp_get_max_threads() (capped at 4 when ALPAKA_CI is defined), one multiprocessor is reported, and the shared and global memory sizes are both getMemBytes(dev).
125  template<typename TDim, typename TIdx>
126  struct GetAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>
127  {
129  {
130 # ifdef ALPAKA_CI
131  auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())); // With ALPAKA_CI defined: never more than 4 threads per block.
132 # else
133  auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(::omp_get_max_threads()); // The int returned by OpenMP is converted to TIdx via clipCast.
134 # endif
135  auto const memBytes = getMemBytes(dev);
136  return {// m_multiProcessorCount
137  static_cast<TIdx>(1),
138  // m_gridBlockExtentMax
140  // m_gridBlockCountMax
142  // m_blockThreadExtentMax: the same limit is used in every dimension
143  Vec<TDim, TIdx>::all(blockThreadCountMax),
144  // m_blockThreadCountMax
145  blockThreadCountMax,
146  // m_threadElemExtentMax
148  // m_threadElemCountMax
150  // m_sharedMemSizeBytes: same value as the global memory size below
151  memBytes,
152  // m_globalMemSizeBytes
153  memBytes};
154  }
155  };
156 
157  //! The CPU OpenMP 2.0 thread accelerator name trait specialization. Builds the name AccCpuOmp2Threads<D,I>, where D is TDim::value in decimal and I is core::demangled<TIdx> (presumably a readable name of the index type - confirm in alpaka/core).
158  template<typename TDim, typename TIdx>
159  struct GetAccName<AccCpuOmp2Threads<TDim, TIdx>>
160  {
161  ALPAKA_FN_HOST static auto getAccName() -> std::string
162  {
163  return "AccCpuOmp2Threads<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
164  }
165  };
166 
167  //! The CPU OpenMP 2.0 thread accelerator device type trait specialization: the device type of this accelerator is DevCpu.
168  template<typename TDim, typename TIdx>
169  struct DevType<AccCpuOmp2Threads<TDim, TIdx>>
170  {
171  using type = DevCpu;
172  };
173 
174  //! The CPU OpenMP 2.0 thread accelerator dimension getter trait specialization: exposes the accelerator's TDim template argument.
175  template<typename TDim, typename TIdx>
176  struct DimType<AccCpuOmp2Threads<TDim, TIdx>>
177  {
178  using type = TDim;
179  };
180 
181  //! The CPU OpenMP 2.0 thread accelerator execution task type trait specialization: creates a TaskKernelCpuOmp2Threads from the work division, the kernel function object and the kernel arguments.
182  template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
183  struct CreateTaskKernel<AccCpuOmp2Threads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
184  {
186  TWorkDiv const& workDiv,
187  TKernelFnObj const& kernelFnObj,
188  TArgs&&... args) // TArgs is a parameter pack of the trait itself, not deduced by this function, so the reference kind of each argument is fixed by whoever names the specialization.
189  {
190  return TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>(
191  workDiv,
192  kernelFnObj,
193  std::forward<TArgs>(args)...);
194  }
195  };
196 
197  //! The CPU OpenMP 2.0 thread accelerator platform type trait specialization: the platform of this accelerator is PlatformCpu.
198  template<typename TDim, typename TIdx>
199  struct PlatformType<AccCpuOmp2Threads<TDim, TIdx>>
200  {
201  using type = PlatformCpu;
202  };
203 
204  //! The CPU OpenMP 2.0 thread accelerator idx type trait specialization: exposes the accelerator's TIdx template argument.
205  template<typename TDim, typename TIdx>
206  struct IdxType<AccCpuOmp2Threads<TDim, TIdx>>
207  {
208  using type = TIdx;
209  };
210 
211  template<typename TDim, typename TIdx> // Accelerator-to-tag trait specialization for AccCpuOmp2Threads.
212  struct AccToTag<alpaka::AccCpuOmp2Threads<TDim, TIdx>>
213  {
215  };
216 
217  template<typename TDim, typename TIdx> // Tag-to-accelerator trait specialization for TagCpuOmp2Threads with the given dimension and index type.
218  struct TagToAcc<alpaka::TagCpuOmp2Threads, TDim, TIdx>
219  {
221  };
222  } // namespace trait
223 } // namespace alpaka
224 
225 #endif
The CPU OpenMP 2.0 thread accelerator.
AccCpuOmp2Threads(AccCpuOmp2Threads &&)=delete
AccCpuOmp2Threads(AccCpuOmp2Threads const &)=delete
auto operator=(AccCpuOmp2Threads &&) -> AccCpuOmp2Threads &=delete
auto operator=(AccCpuOmp2Threads const &) -> AccCpuOmp2Threads &=delete
Dynamic block shared memory provider using fixed-size member array to allocate memory on the stack or...
auto staticMemBegin() const -> uint8_t *
auto staticMemCapacity() const -> std::uint32_t
The OpenMP barrier block synchronization.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU intrinsic.
The CPU OpenMP 2.0 block memory fence.
The CPU OpenMP 2.0 thread accelerator execution task.
An n-dimensional vector.
Definition: Vec.hpp:38
ALPAKA_NO_HOST_ACC_WARNING static constexpr ALPAKA_FN_HOST_ACC auto all(TVal const &val) -> Vec< TDim, TVal >
Single value constructor.
Definition: Vec.hpp:116
ALPAKA_NO_HOST_ACC_WARNING static constexpr ALPAKA_FN_HOST_ACC auto zeros() -> Vec< TDim, TIdx >
Zero value constructor.
Definition: Vec.hpp:126
A basic class holding the work division as grid block extent, block thread and thread element extent.
The OpenMP accelerator index provider.
Definition: IdxBtOmp.hpp:26
An IdxGbRef grid block index.
Definition: IdxGbRef.hpp:20
IdxGbRef(Vec< TDim, TIdx > const &gridBlockIdx)
Definition: IdxGbRef.hpp:22
The standard library math trait specializations.
Definition: MathStdLib.hpp:249
"Tiny" state Mersenne Twister implementation
Definition: RandStdLib.hpp:20
The single-threaded warp to emulate it on CPUs.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto max(T const &max_ctx, Tx const &x, Ty const &y)
Returns the larger of two arguments. NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
Definition: Traits.hpp:1263
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto min(T const &min_ctx, Tx const &x, Ty const &y)
Returns the smaller of two arguments. NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
Definition: Traits.hpp:1280
The alpaka accelerator library.
ALPAKA_FN_HOST auto getMemBytes(TDev const &dev) -> std::size_t
Definition: Traits.hpp:95
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const &blockSync) -> void
Synchronizes all threads within the current block (independently for all blocks).
Definition: Traits.hpp:36
The acceleration properties on a device.
Definition: AccDevProps.hpp:18
The CPU device platform.
Definition: PlatformCpu.hpp:18
Tag used in class inheritance hierarchies that describes that a specific concept (TConcept) is implem...
Definition: Concepts.hpp:15
The accelerator type trait.
Definition: Traits.hpp:37
static ALPAKA_FN_HOST auto createTaskKernel(TWorkDiv const &workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
The kernel execution task creation trait.
Definition: Traits.hpp:34
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
static ALPAKA_FN_HOST auto getAccDevProps(DevCpu const &dev) -> alpaka::AccDevProps< TDim, TIdx >
The device properties get trait.
Definition: Traits.hpp:41
static ALPAKA_FN_HOST auto getAccName() -> std::string
The accelerator name trait.
Definition: Traits.hpp:48
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30