alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelCpuThreads.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
// Specialized traits.
#include "alpaka/acc/Traits.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/dim/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/platform/Traits.hpp"

// Implementation details.
#include "alpaka/acc/AccCpuThreads.hpp"
#include "alpaka/core/Debug.hpp"
#include "alpaka/core/Decay.hpp"
#include "alpaka/core/ThreadPool.hpp"
#include "alpaka/dev/DevCpu.hpp"
#include "alpaka/kernel/KernelFunctionAttributes.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/meta/NdLoop.hpp"
#include "alpaka/platform/PlatformCpu.hpp"
#include "alpaka/workdiv/WorkDivMembers.hpp"

#include <algorithm>
#include <cstddef>
#include <functional>
#include <future>
#include <mutex>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
#    include <iostream>
#endif
36 
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

namespace alpaka
{
    //! The CPU threads execution task.
    //!
    //! Holds a work division (via the WorkDivMembers base), a copy of the kernel function object and decayed
    //! copies of the kernel arguments. Invoking the task iterates over the grid blocks one after another and, for
    //! each block, enqueues one task per block thread on a thread pool and waits for all of them.
    //!
    //! \tparam TDim The dimensionality of the work division.
    //! \tparam TIdx The index type of the work division.
    //! \tparam TKernelFnObj The kernel function object type.
    //! \tparam TArgs The kernel invocation argument types.
    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelCpuThreads final : public WorkDivMembers<TDim, TIdx>
    {
    private:
        // When using the thread pool the threads are yielding because this is faster.
        // Using condition variables and going to sleep is very costly for real threads.
        // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
        // NOTE(review): alias target restored from the upstream header - confirm against ThreadPool.hpp.
        using ThreadPool = alpaka::core::detail::ThreadPool;

    public:
        //! Stores the work division, the kernel function object and the kernel arguments.
        //! \tparam TWorkDiv The work division type; its dimensionality has to equal TDim.
        //! \param workDiv The work division forwarded to the WorkDivMembers base.
        //! \param kernelFnObj The kernel function object; a copy is stored.
        //! \param args The kernel invocation arguments; stored as decayed values.
        template<typename TWorkDiv>
        ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
            , m_kernelFnObj(kernelFnObj)
            , m_args(std::forward<TArgs>(
                  args)...) // FIXME(bgruber): this does not forward, since TArgs is not a deduced template parameter
        {
            static_assert(
                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
                "The work division and the execution task have to be of the same dimensionality!");
        }

        //! Executes the kernel function object.
        ALPAKA_FN_HOST auto operator()() const -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // Unpack the stored argument tuple into a regular argument list.
            std::apply([&](auto const&... args) { runWithArgs(args...); }, m_args);
        }

    private:
        //! Runs the kernel for the whole grid with the unpacked kernel arguments.
        //! \param args The stored kernel invocation arguments.
        ALPAKA_FN_HOST auto runWithArgs(std::decay_t<TArgs> const&... args) const -> void
        {
            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);

            // Get the size of the block shared dynamic memory.
            auto const smBytes = getBlockSharedMemDynSizeBytes<AccCpuThreads<TDim, TIdx>>(
                m_kernelFnObj,
                blockThreadExtent,
                threadElemExtent,
                args...);
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            std::cout << __func__ << " smBytes: " << smBytes << " B" << std::endl;
#    endif
            // A single accelerator object is created here and reused for every block.
            AccCpuThreads<TDim, TIdx> acc(*static_cast<WorkDivMembers<TDim, TIdx> const*>(this), smBytes);

            // The pool is constructed with the number of threads in one block.
            auto const threadsPerBlock = blockThreadExtent.prod();
            ThreadPool threadPool(static_cast<std::size_t>(threadsPerBlock));

            // Execute the blocks serially.
            meta::ndLoopIncIdx(
                gridBlockExtent,
                [&](Vec<TDim, TIdx> const& gridBlockIdx)
                { runBlock(acc, gridBlockIdx, blockThreadExtent, threadPool, m_kernelFnObj, args...); });
        }

        //! The function executed for each grid block.
        //! \param acc The accelerator object; its block index and per-block state are updated here.
        //! \param gridBlockIdx The index of the block within the grid.
        //! \param blockThreadExtent The extent of a block in threads.
        //! \param threadPool The pool the block threads are enqueued on.
        //! \param kernelFnObj The kernel function object.
        //! \param args The kernel invocation arguments.
        ALPAKA_FN_HOST static auto runBlock(
            AccCpuThreads<TDim, TIdx>& acc,
            Vec<TDim, TIdx> const& gridBlockIdx,
            Vec<TDim, TIdx> const& blockThreadExtent,
            ThreadPool& threadPool,
            TKernelFnObj const& kernelFnObj,
            std::decay_t<TArgs> const&... args) -> void
        {
            std::vector<std::future<void>> futuresInBlock;
            acc.m_gridBlockIdx = gridBlockIdx;

            // Execute the threads of this block in parallel.
            meta::ndLoopIncIdx(
                blockThreadExtent,
                [&](Vec<TDim, TIdx> const& blockThreadIdx)
                {
                    // copy blockThreadIdx because it will get changed for the next iteration/thread.
                    futuresInBlock.emplace_back(threadPool.enqueueTask(
                        [&, blockThreadIdx] { runThread(acc, blockThreadIdx, kernelFnObj, args...); }));
                });

            // Wait for the completion of the block thread kernels.
            for(auto& t : futuresInBlock)
                t.wait();

            // Clean up.
            futuresInBlock.clear();
            acc.m_threadToIndexMap.clear();
            freeSharedVars(acc); // After a block has been processed, the shared memory has to be deleted.
        }

        //! The thread entry point on the accelerator.
        //! \param acc The accelerator object shared by all threads of the block.
        //! \param blockThreadIdx The index of this thread within its block.
        //! \param kernelFnObj The kernel function object.
        //! \param args The kernel invocation arguments.
        ALPAKA_FN_HOST static auto runThread(
            AccCpuThreads<TDim, TIdx>& acc,
            Vec<TDim, TIdx> const& blockThreadIdx,
            TKernelFnObj const& kernelFnObj,
            std::decay_t<TArgs> const&... args) -> void
        {
            // We have to store the thread data before the kernel is calling any of the methods of this class depending
            // on them.
            auto const threadId = std::this_thread::get_id();

            // The thread whose index components sum to zero is recorded as the master thread.
            if(blockThreadIdx.sum() == 0)
            {
                acc.m_idMasterThread = threadId;
            }

            {
                // Save the thread id, and index.
                std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
                acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
            }

            // Sync all threads so that the maps with thread id's are complete and not changed after here.
            syncBlockThreads(acc);

            // Execute the kernel itself.
            kernelFnObj(std::as_const(acc), args...);

            // We have to sync all threads here because if a thread would finish before all threads have been started,
            // a new thread could get the recycled (then duplicate) thread id!
            syncBlockThreads(acc);
        }

        TKernelFnObj m_kernelFnObj; //!< Copy of the kernel function object.
        std::tuple<std::decay_t<TArgs>...> m_args; //!< Decayed copies of the kernel invocation arguments.
    };

    namespace trait
    {
        //! The CPU threads execution task accelerator type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct AccType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = AccCpuThreads<TDim, TIdx>;
        };

        //! The CPU threads execution task device type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevCpu;
        };

        //! The CPU threads execution task dimension getter trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The CPU threads execution task platform type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformCpu;
        };

        //! The CPU threads execution task idx type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };

        //! \brief Specialisation of the class template FunctionAttributes
        //! \tparam TDev The device type.
        //! \tparam TDim The dimensionality of the accelerator device properties.
        //! \tparam TIdx The idx type of the accelerator device properties.
        //! \tparam TKernelFn Kernel function object type.
        //! \tparam TArgs Kernel function object argument types as a parameter pack.
        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
        struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
        {
            //! \param dev The device instance
            //! \param kernelFn The kernel function object which should be executed (unused here).
            //! \param args The kernel invocation arguments (unused here).
            //! \return KernelFunctionAttributes instance. maxThreadsPerBlock is set from the accelerator device
            //! properties of dev and maxDynamicSharedSizeBytes from BlockSharedDynMemberAllocKiB; all other fields
            //! are left as default-initialized.
            ALPAKA_FN_HOST static auto getFunctionAttributes(
                TDev const& dev,
                [[maybe_unused]] TKernelFn const& kernelFn,
                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
            {
                alpaka::KernelFunctionAttributes kernelFunctionAttributes;

                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
                // properties function.
                auto const& props = alpaka::getAccDevProps<AccCpuThreads<TDim, TIdx>>(dev);
                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
                kernelFunctionAttributes.maxDynamicSharedSizeBytes
                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
                return kernelFunctionAttributes;
            }
        };

    } // namespace trait
} // namespace alpaka

#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU threads accelerator.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU threads execution task.
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const &extent, TFnObj const &f) -> void
Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration....
Definition: NdLoop.hpp:81
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const &blockSync) -> void
Synchronizes all threads within the current block (independently for all blocks).
Definition: Traits.hpp:36
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition: Traits.hpp:54
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
The CPU device platform.
Definition: PlatformCpu.hpp:18
A thread pool yielding when there is not enough work to be done.
Definition: ThreadPool.hpp:20
The accelerator type trait.
Definition: Traits.hpp:37
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, [[maybe_unused]] TKernelFn const &kernelFn, [[maybe_unused]] TArgs &&... args) -> alpaka::KernelFunctionAttributes
The structure template to access to the functions attributes of a kernel function object.
Definition: Traits.hpp:79
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30