alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelCpuThreads.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 // Specialized traits.
8 #include "alpaka/acc/Traits.hpp"
9 #include "alpaka/dev/Traits.hpp"
10 #include "alpaka/dim/Traits.hpp"
11 #include "alpaka/idx/Traits.hpp"
13 
14 // Implementation details.
17 #include "alpaka/core/Decay.hpp"
19 #include "alpaka/dev/DevCpu.hpp"
20 #include "alpaka/kernel/Traits.hpp"
21 #include "alpaka/meta/NdLoop.hpp"
23 
24 #include <algorithm>
25 #include <functional>
26 #include <future>
27 #include <thread>
28 #include <tuple>
29 #include <type_traits>
30 #include <vector>
31 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
32 # include <iostream>
33 #endif
34 
35 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
36 
37 namespace alpaka
38 {
39  //! The CPU threads execution task.
40  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
41  class TaskKernelCpuThreads final : public WorkDivMembers<TDim, TIdx>
42  {
43  private:
44  // When using the thread pool the threads are yielding because this is faster.
45  // Using condition variables and going to sleep is very costly for real threads.
46  // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
48 
49  public:
50  template<typename TWorkDiv>
51  ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
52  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
53  , m_kernelFnObj(kernelFnObj)
54  , m_args(std::forward<TArgs>(
55  args)...) // FIXME(bgruber): this does not forward, since TArgs is not a deduced template parameter
56  {
57  static_assert(
58  Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
59  "The work division and the execution task have to be of the same dimensionality!");
60  }
61 
62  //! Executes the kernel function object.
63  ALPAKA_FN_HOST auto operator()() const -> void
64  {
66 
67  std::apply([&](auto const&... args) { runWithArgs(args...); }, m_args);
68  }
69 
70  private:
71  ALPAKA_FN_HOST auto runWithArgs(std::decay_t<TArgs> const&... args) const -> void
72  {
73  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
74  auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
75  auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
76 
77  // Get the size of the block shared dynamic memory.
78  auto const smBytes = getBlockSharedMemDynSizeBytes<AccCpuThreads<TDim, TIdx>>(
79  m_kernelFnObj,
80  blockThreadExtent,
81  threadElemExtent,
82  args...);
83 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
84  std::cout << __func__ << " smBytes: " << smBytes << " B" << std::endl;
85 # endif
86  AccCpuThreads<TDim, TIdx> acc(*static_cast<WorkDivMembers<TDim, TIdx> const*>(this), smBytes);
87 
88  auto const threadsPerBlock = blockThreadExtent.prod();
89  ThreadPool threadPool(static_cast<std::size_t>(threadsPerBlock));
90 
91  // Execute the blocks serially.
93  gridBlockExtent,
94  [&](Vec<TDim, TIdx> const& gridBlockIdx)
95  { runBlock(acc, gridBlockIdx, blockThreadExtent, threadPool, m_kernelFnObj, args...); });
96  }
97 
98  //! The function executed for each grid block.
99  ALPAKA_FN_HOST static auto runBlock(
100  AccCpuThreads<TDim, TIdx>& acc,
101  Vec<TDim, TIdx> const& gridBlockIdx,
102  Vec<TDim, TIdx> const& blockThreadExtent,
103  ThreadPool& threadPool,
104  TKernelFnObj const& kernelFnObj,
105  std::decay_t<TArgs> const&... args) -> void
106  {
107  std::vector<std::future<void>> futuresInBlock;
108  acc.m_gridBlockIdx = gridBlockIdx;
109 
110  // Execute the threads of this block in parallel.
112  blockThreadExtent,
113  [&](Vec<TDim, TIdx> const& blockThreadIdx)
114  {
115  // copy blockThreadIdx because it will get changed for the next iteration/thread.
116  futuresInBlock.emplace_back(threadPool.enqueueTask(
117  [&, blockThreadIdx] { runThread(acc, blockThreadIdx, kernelFnObj, args...); }));
118  });
119 
120  // Wait for the completion of the block thread kernels.
121  for(auto& t : futuresInBlock)
122  t.wait();
123 
124  // Clean up.
125  futuresInBlock.clear();
126  acc.m_threadToIndexMap.clear();
127  freeSharedVars(acc); // After a block has been processed, the shared memory has to be deleted.
128  }
129 
130  //! The thread entry point on the accelerator.
131  ALPAKA_FN_HOST static auto runThread(
132  AccCpuThreads<TDim, TIdx>& acc,
133  Vec<TDim, TIdx> const& blockThreadIdx,
134  TKernelFnObj const& kernelFnObj,
135  std::decay_t<TArgs> const&... args) -> void
136  {
137  // We have to store the thread data before the kernel is calling any of the methods of this class depending
138  // on them.
139  auto const threadId = std::this_thread::get_id();
140 
141  if(blockThreadIdx.sum() == 0)
142  {
143  acc.m_idMasterThread = threadId;
144  }
145 
146  {
147  // Save the thread id, and index.
148  std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
149  acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
150  }
151 
152  // Sync all threads so that the maps with thread id's are complete and not changed after here.
153  syncBlockThreads(acc);
154 
155  // Execute the kernel itself.
156  kernelFnObj(std::as_const(acc), args...);
157 
158  // We have to sync all threads here because if a thread would finish before all threads have been started,
159  // a new thread could get the recycled (then duplicate) thread id!
160  syncBlockThreads(acc);
161  }
162 
163  TKernelFnObj m_kernelFnObj;
164  std::tuple<std::decay_t<TArgs>...> m_args;
165  };
166 
167  namespace trait
168  {
169  //! The CPU threads execution task accelerator type trait specialization.
170  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
171  struct AccType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
172  {
174  };
175 
176  //! The CPU threads execution task device type trait specialization.
177  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
178  struct DevType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
179  {
180  using type = DevCpu;
181  };
182 
183  //! The CPU threads execution task dimension getter trait specialization.
184  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
185  struct DimType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
186  {
187  using type = TDim;
188  };
189 
190  //! The CPU threads execution task platform type trait specialization.
191  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
192  struct PlatformType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
193  {
194  using type = PlatformCpu;
195  };
196 
197  //! The CPU threads execution task idx type trait specialization.
198  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
199  struct IdxType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
200  {
201  using type = TIdx;
202  };
203  } // namespace trait
204 } // namespace alpaka
205 
206 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU threads accelerator.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU threads execution task.
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const &extent, TFnObj const &f) -> void
Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration....
Definition: NdLoop.hpp:81
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const &blockSync) -> void
Synchronizes all threads within the current block (independently for all blocks).
Definition: Traits.hpp:36
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition: Traits.hpp:54
The CPU device platform.
Definition: PlatformCpu.hpp:18
A thread pool yielding when there is not enough work to be done.
Definition: ThreadPool.hpp:20
The accelerator type trait.
Definition: Traits.hpp:37
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30