alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
TaskKernelCpuThreads.hpp
Go to the documentation of this file.
1/* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7// Specialized traits.
10#include "alpaka/dim/Traits.hpp"
11#include "alpaka/idx/Traits.hpp"
13
14// Implementation details.
17#include "alpaka/core/Decay.hpp"
19#include "alpaka/dev/DevCpu.hpp"
25
26#include <algorithm>
27#include <functional>
28#include <future>
29#include <thread>
30#include <tuple>
31#include <type_traits>
32#include <vector>
33#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
34# include <iostream>
35#endif
36
37#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
38
39namespace alpaka
40{
41 //! The CPU threads execution task.
   //!
   //! Bundles a work division (inherited from WorkDivMembers), a copy of the kernel function object and
   //! decayed copies of the kernel arguments. Calling operator() runs the kernel over the whole grid:
   //! grid blocks are processed one after another, the threads of one block are run through a thread pool.
   //! \tparam TDim The dimensionality of the work division.
   //! \tparam TIdx The index type of the work division.
   //! \tparam TKernelFnObj The kernel function object type.
   //! \tparam TArgs The kernel argument types.
42 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
43 class TaskKernelCpuThreads final : public WorkDivMembers<TDim, TIdx>
44 {
45 private:
46 // When using the thread pool the threads are yielding because this is faster.
47 // Using condition variables and going to sleep is very costly for real threads.
48 // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
   // NOTE(review): the listing jumps from line 48 to line 50 here and never declares the `ThreadPool` name
   // that is used further down; presumably the dropped line is the alias for the thread pool type.
   // Confirm against the upstream header.
50
51 public:
   //! Creates the task from a work division, a kernel function object and the kernel arguments.
   //! \tparam TWorkDiv The work division type; must have the same dimensionality as TDim (checked at compile time).
   //! \param workDiv The work division, forwarded into the WorkDivMembers base.
   //! \param kernelFnObj The kernel function object; a copy is stored in the task.
   //! \param args The kernel arguments; stored as decayed copies in a tuple.
52 template<typename TWorkDiv>
53 ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
54 : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
55 , m_kernelFnObj(kernelFnObj)
56 , m_args(std::forward<TArgs>(
57 args)...) // FIXME(bgruber): this does not forward, since TArgs is not a deduced template parameter
58 {
59 static_assert(
60 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
61 "The work division and the execution task have to be of the same dimensionality!");
62 }
63
64 //! Executes the kernel function object.
   //! Unpacks the stored argument tuple and hands the arguments to runWithArgs.
65 ALPAKA_FN_HOST auto operator()() const -> void
66 {
   // NOTE(review): the listing jumps from line 66 to line 68; one statement of the original body seems to
   // be missing from this extracted page. Confirm against the upstream header.
68
69 std::apply([&](auto const&... args) { runWithArgs(args...); }, m_args);
70 }
71
72 private:
   //! Runs the kernel for the whole grid with the already unpacked arguments.
   //! \param args The kernel arguments (references to the copies stored in m_args).
73 ALPAKA_FN_HOST auto runWithArgs(std::decay_t<TArgs> const&... args) const -> void
74 {
   // Query the three extents of the work division stored in the base class.
75 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
76 auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
77 auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
78
79 // Get the size of the block shared dynamic memory.
80 auto const smBytes = getBlockSharedMemDynSizeBytes<AccCpuThreads<TDim, TIdx>>(
81 m_kernelFnObj,
82 blockThreadExtent,
83 threadElemExtent,
84 args...);
85# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
86 std::cout << __func__ << " smBytes: " << smBytes << " B" << std::endl;
87# endif
   // One accelerator object is created from the work division and the shared memory size;
   // it is passed by reference to every block and thread below.
88 AccCpuThreads<TDim, TIdx> acc(*static_cast<WorkDivMembers<TDim, TIdx> const*>(this), smBytes);
89
   // The pool is sized to the number of threads in one block (product of the block thread extent).
90 auto const threadsPerBlock = blockThreadExtent.prod();
91 ThreadPool threadPool(static_cast<std::size_t>(threadsPerBlock));
92
93 // Execute the blocks serially.
   // NOTE(review): the listing jumps from line 93 to line 95, so the call that receives the two arguments
   // below is missing from this page. Presumably it is an n-dimensional loop over gridBlockExtent that
   // invokes the lambda once per grid block index (runBlock is called with each index) - confirm upstream.
95 gridBlockExtent,
96 [&](Vec<TDim, TIdx> const& gridBlockIdx)
97 { runBlock(acc, gridBlockIdx, blockThreadExtent, threadPool, m_kernelFnObj, args...); });
98 }
99
100 //! The function executed for each grid block.
    //! \param acc The accelerator object; its grid block index, thread map and shared variables are modified here.
    //! \param gridBlockIdx The index of the block to execute.
    //! \param blockThreadExtent The extent of threads within one block.
    //! \param threadPool The pool the per-thread tasks are enqueued into.
    //! \param kernelFnObj The kernel function object.
    //! \param args The kernel arguments.
101 ALPAKA_FN_HOST static auto runBlock(
102 AccCpuThreads<TDim, TIdx>& acc,
103 Vec<TDim, TIdx> const& gridBlockIdx,
104 Vec<TDim, TIdx> const& blockThreadExtent,
105 ThreadPool& threadPool,
106 TKernelFnObj const& kernelFnObj,
107 std::decay_t<TArgs> const&... args) -> void
108 {
    // Collects one future per task enqueued for this block.
109 std::vector<std::future<void>> futuresInBlock;
    // Publish the current block index on the accelerator object before any thread task is enqueued.
110 acc.m_gridBlockIdx = gridBlockIdx;
111
112 // Execute the threads of this block in parallel.
    // NOTE(review): the listing jumps from line 112 to line 114, so the call that receives the two arguments
    // below is missing from this page. Presumably it is an n-dimensional loop over blockThreadExtent that
    // invokes the lambda once per block thread index - confirm upstream.
114 blockThreadExtent,
115 [&](Vec<TDim, TIdx> const& blockThreadIdx)
116 {
117 // copy blockThreadIdx because it will get changed for the next iteration/thread.
118 futuresInBlock.emplace_back(threadPool.enqueueTask(
119 [&, blockThreadIdx] { runThread(acc, blockThreadIdx, kernelFnObj, args...); }));
120 });
121
122 // Wait for the completion of the block thread kernels.
123 for(auto& t : futuresInBlock)
124 t.wait();
125
126 // Clean up.
127 futuresInBlock.clear();
128 acc.m_threadToIndexMap.clear();
129 freeSharedVars(acc); // After a block has been processed, the shared memory has to be deleted.
130 }
131
132 //! The thread entry point on the accelerator.
    //! \param acc The accelerator object shared by all threads of the block.
    //! \param blockThreadIdx The index of this thread within its block.
    //! \param kernelFnObj The kernel function object to invoke.
    //! \param args The kernel arguments.
133 ALPAKA_FN_HOST static auto runThread(
134 AccCpuThreads<TDim, TIdx>& acc,
135 Vec<TDim, TIdx> const& blockThreadIdx,
136 TKernelFnObj const& kernelFnObj,
137 std::decay_t<TArgs> const&... args) -> void
138 {
139 // We have to store the thread data before the kernel is calling any of the methods of this class depending
140 // on them.
141 auto const threadId = std::this_thread::get_id();
142
    // The thread whose index components sum to zero records its id as the master thread id.
143 if(blockThreadIdx.sum() == 0)
144 {
145 acc.m_idMasterThread = threadId;
146 }
147
148 {
149 // Save the thread id, and index.
    // The insertion into the shared map is guarded by the accelerator's m_mtxMapInsert mutex.
150 std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
151 acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
152 }
153
154 // Sync all threads so that the maps with thread id's are complete and not changed after here.
155 syncBlockThreads(acc);
156
157 // Execute the kernel itself.
    // The kernel receives the accelerator as a const reference (std::as_const).
158 kernelFnObj(std::as_const(acc), args...);
159
160 // We have to sync all threads here because if a thread would finish before all threads have been started,
161 // a new thread could get the recycled (then duplicate) thread id!
162 syncBlockThreads(acc);
163 }
164
    //! Copy of the kernel function object passed to the constructor.
165 TKernelFnObj m_kernelFnObj;
    //! Decayed copies of the kernel arguments passed to the constructor.
166 std::tuple<std::decay_t<TArgs>...> m_args;
167 };
168
169 namespace trait
170 {
171 //! The CPU threads execution task accelerator type trait specialization.
    //! Maps a TaskKernelCpuThreads task to AccCpuThreads with the same dimension and index type.
172 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
173 struct AccType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
174 {
175 using type = AccCpuThreads<TDim, TIdx>;
176 };
177
178 //! The CPU threads execution task device type trait specialization.
    //! Maps a TaskKernelCpuThreads task to the DevCpu device type.
179 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
180 struct DevType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
181 {
182 using type = DevCpu;
183 };
184
185 //! The CPU threads execution task dimension getter trait specialization.
    //! The dimension of the task is its TDim template argument.
186 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
187 struct DimType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
188 {
189 using type = TDim;
190 };
191
192 //! The CPU threads execution task platform type trait specialization.
    //! Maps a TaskKernelCpuThreads task to the PlatformCpu platform type.
193 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
194 struct PlatformType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
195 {
196 using type = PlatformCpu;
197 };
198
199 //! The CPU threads execution task idx type trait specialization.
    //! The index type of the task is its TIdx template argument.
200 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
201 struct IdxType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
202 {
203 using type = TIdx;
204 };
205
206 //! \brief Specialisation of the class template FunctionAttributes for the AccCpuThreads accelerator.
207 //! \tparam TDev The device type.
208 //! \tparam TDim The dimensionality of the accelerator device properties.
209 //! \tparam TIdx The idx type of the accelerator device properties.
210 //! \tparam TKernelFn Kernel function object type.
211 //! \tparam TArgs Kernel function object argument types as a parameter pack.
212 template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
213 struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
214 {
215 //! \param dev The device instance whose accelerator properties are queried.
216 //! \param kernelFn The kernel function object which should be executed (not used by this implementation).
217 //! \param args The kernel invocation arguments (not used by this implementation).
218 //! \return KernelFunctionAttributes instance in which maxThreadsPerBlock is set to the device property
219 //! m_blockThreadCountMax and maxDynamicSharedSizeBytes to BlockSharedDynMemberAllocKiB * 1024.
    // NOTE(review): the listing jumps from line 219 to line 221, so the line that opens the function
    // declaration (return-type specifier and function name) is missing from this page; the parameter
    // list below belongs to it. Confirm against the upstream header.
221 TDev const& dev,
222 [[maybe_unused]] TKernelFn const& kernelFn,
223 [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
224 {
225 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
226
227 // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
228 // properties function.
229 auto const& props = alpaka::getAccDevProps<AccCpuThreads<TDim, TIdx>>(dev);
230 kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
    // The dynamic shared memory limit is the KiB constant converted to bytes.
231 kernelFunctionAttributes.maxDynamicSharedSizeBytes
232 = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
233 return kernelFunctionAttributes;
234 }
235 };
236
237 } // namespace trait
238} // namespace alpaka
239
240#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
The CPU threads accelerator.
The CPU threads execution task.
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
A n-dimensional vector.
Definition Vec.hpp:38
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const &extent, TFnObj const &f) -> void
Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration....
Definition NdLoop.hpp:81
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const &blockSync) -> void
Synchronizes all threads within the current block (independently for all blocks).
Definition Traits.hpp:36
ALPAKA_FN_HOST auto wait(TAwaited const &awaited) -> void
Waits the thread for the completion of the given awaited action to complete.
Definition Traits.hpp:34
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition Traits.hpp:54
STL namespace.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
A thread pool yielding when there is not enough work to be done.
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, TKernelFnObj const &kernelFn, TArgs &&... args) -> alpaka::KernelFunctionAttributes
Definition Traits.hpp:85