alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
TaskKernelCpuOmp2Threads.hpp
Go to the documentation of this file.
1/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera, Jan Stephan, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7// Specialized traits.
10#include "alpaka/dim/Traits.hpp"
11#include "alpaka/idx/Traits.hpp"
13
14// Implementation details.
16#include "alpaka/core/Decay.hpp"
17#include "alpaka/dev/DevCpu.hpp"
23
24#include <functional>
25#include <stdexcept>
26#include <tuple>
27#include <type_traits>
28#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
29# include <iostream>
30#endif
31
32#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
33
34# if _OPENMP < 200203
35# error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
36# endif
37
38# include <omp.h>
39
40namespace alpaka
41{
42 //! The CPU OpenMP 2.0 thread accelerator execution task.
   //!
   //! Derives from WorkDivMembers<TDim, TIdx> (the work division) and additionally stores a copy of the kernel
   //! function object plus decayed copies of the kernel arguments. Calling operator()() runs the kernel for
   //! every block of the grid, one block after another, with the threads of a block run in an OpenMP parallel
   //! region.
   //! \tparam TDim The dimensionality of the work division.
   //! \tparam TIdx The index type of the work division.
   //! \tparam TKernelFnObj The kernel function object type.
   //! \tparam TArgs The kernel argument types as passed to the constructor (stored decayed).
43 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
44 class TaskKernelCpuOmp2Threads final : public WorkDivMembers<TDim, TIdx>
45 {
46 public:
   //! Creates the task.
   //! \param workDiv The work division, forwarded to the WorkDivMembers base.
   //! \param kernelFnObj The kernel function object; copied into the task.
   //! \param args The kernel arguments; forwarded into a tuple of decayed values.
   //! Compilation fails if the dimensionality of TWorkDiv differs from TDim.
47 template<typename TWorkDiv>
48 ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
49 : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
50 , m_kernelFnObj(kernelFnObj)
51 , m_args(std::forward<TArgs>(args)...)
52 {
53 static_assert(
54 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
55 "The work division and the execution task have to be of the same dimensionality!");
56 }
57
58 //! Executes the kernel function object.
   //!
   //! Iterates over the grid block extent and, per block, opens an `omp parallel` region with
   //! num_threads equal to the product of the block thread extent. The OpenMP dynamic-threads setting is
   //! disabled for the duration of the call and restored at the end.
   //! \throws std::runtime_error if called from within an active OpenMP parallel region, if the runtime did
   //! not create a parallel region although more than one thread was requested, or if the runtime
   //! did not provide the requested number of threads.
59 ALPAKA_FN_HOST auto operator()() const -> void
60 {
   // NOTE(review): listing line 61 is missing from this extract (presumably a debug log-scope macro) -
   // confirm against the original header.
62
   // Query the three levels of the work division from the WorkDivMembers base.
63 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
64 auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
65 auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
66
67 // Get the size of the block shared dynamic memory.
   // The stored arguments are unpacked from the tuple and passed by const reference.
68 auto const blockSharedMemDynSizeBytes = std::apply(
69 [&](std::decay_t<TArgs> const&... args)
70 {
71 return getBlockSharedMemDynSizeBytes<AccCpuOmp2Threads<TDim, TIdx>>(
72 m_kernelFnObj,
73 blockThreadExtent,
74 threadElemExtent,
75 args...);
76 },
77 m_args);
78
79# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
80 std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
81 << std::endl;
82# endif
83
   // NOTE(review): listing line 84 is missing from this extract; the two lines below are the remaining
   // constructor arguments (work division and dynamic shared memory size) of what is presumably the
   // declaration of the accelerator object `acc` used further down - confirm against the original header.
85 *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
86 blockSharedMemDynSizeBytes);
87
88 // The number of threads in this block.
89 TIdx const blockThreadCount(blockThreadExtent.prod());
   // OpenMP takes the thread count as int, so the TIdx value is narrowed here.
90 [[maybe_unused]] int const iBlockThreadCount(static_cast<int>(blockThreadCount));
91
   // Nested use is rejected up front, before any OpenMP state is modified.
92 if(::omp_in_parallel() != 0)
93 {
94 throw std::runtime_error(
95 "The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
96 }
97
98 // Force the environment to use the given number of threads.
   // The previous dynamic-adjustment setting is remembered so it can be restored at the end.
99 int const ompIsDynamic(::omp_get_dynamic());
100 ::omp_set_dynamic(0);
101
102 // Execute the blocks serially.
   // NOTE(review): listing line 103 is missing from this extract; it is presumably the call to the
   // n-dimensional loop helper that receives gridBlockExtent and the lambda below - confirm.
104 gridBlockExtent,
105 [&](Vec<TDim, TIdx> const& gridBlockIdx)
106 {
   // Publish the index of the block being processed to the accelerator object.
107 acc.m_gridBlockIdx = gridBlockIdx;
108
109// Execute the threads in parallel.
110
111// Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to
112// be done with their work up to this line. So we have to spawn one OS thread per thread in a block. 'omp for' is not
113// useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1
114// mapping is required. Therefore we use 'omp parallel' with the specified number of threads in a block.
115# pragma omp parallel num_threads(iBlockThreadCount)
116 {
117 // The guard is for gcc internal compiler error, as discussed in #735
   // With GCC older than 8.1.0 the sanity checks below are compiled out entirely.
118 if constexpr((!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0)))
119 {
   // Only one thread of the team runs the checks; 'nowait' lets the other threads continue
   // to the kernel call without waiting for it.
   // NOTE(review): the exceptions below are thrown inside the parallel region - confirm that
   // this is acceptable for the OpenMP runtimes in use.
120# pragma omp single nowait
121 {
122 // The OpenMP runtime does not create a parallel region when only one thread is
123 // required in the num_threads clause. In all other cases we expect to be in a parallel
124 // region now.
125 if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
126 {
127 throw std::runtime_error(
128 "The OpenMP 2.0 runtime did not create a parallel region!");
129 }
130
   // The 1:1 mapping of block threads to OpenMP threads requires the exact team size.
131 int const numThreads = ::omp_get_num_threads();
132 if(numThreads != iBlockThreadCount)
133 {
134 throw std::runtime_error(
135 "The OpenMP 2.0 runtime did not use the number of threads "
136 "that had been required!");
137 }
138 }
139 }
140
   // Every thread of the team invokes the kernel with `acc` (by reference) as the first argument,
   // followed by the stored arguments; tuple_cat copies the stored arguments into the call tuple.
141 std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
142
143 // Wait for all threads to finish before deleting the shared memory.
144 // This is done by default if the omp 'nowait' clause is missing on the omp parallel directive
145 // syncBlockThreads(acc);
146 }
147
148 // After a block has been processed, the shared memory has to be deleted.
149 freeSharedVars(acc);
150 });
151
152 // Reset the dynamic thread number setting.
   // NOTE(review): this restore is only reached on normal completion; if an exception propagates out of
   // the block loop above, dynamic adjustment stays disabled.
153 ::omp_set_dynamic(ompIsDynamic);
154 }
155
156 private:
   // Copy of the kernel function object taken at construction.
157 TKernelFnObj m_kernelFnObj;
   // Decayed copies of the kernel arguments taken at construction.
158 std::tuple<std::decay_t<TArgs>...> m_args;
159 };
160
161 namespace trait
162 {
163 //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
    //! Maps every TaskKernelCpuOmp2Threads instantiation to AccCpuOmp2Threads<TDim, TIdx>.
164 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
165 struct AccType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
166 {
167 using type = AccCpuOmp2Threads<TDim, TIdx>;
168 };
169
170 //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
    //! Maps every TaskKernelCpuOmp2Threads instantiation to DevCpu.
171 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
172 struct DevType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
173 {
174 using type = DevCpu;
175 };
176
177 //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
    //! Exposes the task's TDim template parameter as the dimension type.
178 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
179 struct DimType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
180 {
181 using type = TDim;
182 };
183
184 //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
    //! Maps every TaskKernelCpuOmp2Threads instantiation to PlatformCpu.
185 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
186 struct PlatformType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
187 {
188 using type = PlatformCpu;
189 };
190
191 //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
    //! Exposes the task's TIdx template parameter as the index type.
192 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
193 struct IdxType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
194 {
195 using type = TIdx;
196 };
197
198 //! \brief Specialisation of the class template FunctionAttributes for AccCpuOmp2Threads<TDim, TIdx>.
199 //! \tparam TDev The device type.
200 //! \tparam TDim The dimensionality of the accelerator device properties.
201 //! \tparam TIdx The idx type of the accelerator device properties.
202 //! \tparam TKernelFn Kernel function object type.
203 //! \tparam TArgs Kernel function object argument types as a parameter pack.
204 template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
205 struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
206 {
207 //! \param dev The device instance
208 //! \param kernelFn The kernel function object which should be executed.
209 //! \param args The kernel invocation arguments.
210 //! \return KernelFunctionAttributes instance. maxThreadsPerBlock is taken from the accelerator device
211 //! properties of dev and maxDynamicSharedSizeBytes is BlockSharedDynMemberAllocKiB * 1024; no other field is set here.
    //! kernelFn and args are not inspected by this specialisation.
    // NOTE(review): listing line 212 is missing from this extract; it is presumably the function header
    // (`getFunctionAttributes(`) to which the parameter list below belongs - confirm against the original header.
213 TDev const& dev,
214 [[maybe_unused]] TKernelFn const& kernelFn,
215 [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
216 {
217 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
218
219 // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
220 // properties function.
221 auto const& props = alpaka::getAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>(dev);
222 kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
    // The dynamic shared memory limit is the library constant converted from KiB to bytes.
223 kernelFunctionAttributes.maxDynamicSharedSizeBytes
224 = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
225 return kernelFunctionAttributes;
226 }
227 };
228
229 } // namespace trait
230} // namespace alpaka
231
232#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
The CPU OpenMP 2.0 thread accelerator.
The CPU OpenMP 2.0 thread accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
An n-dimensional vector.
Definition Vec.hpp:38
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const &extent, TFnObj const &f) -> void
Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration....
Definition NdLoop.hpp:81
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition Traits.hpp:54
STL namespace.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, TKernelFnObj const &kernelFn, TArgs &&... args) -> alpaka::KernelFunctionAttributes
Definition Traits.hpp:85