alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelCpuOmp2Blocks.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 // Specialized traits.
8 #include "alpaka/acc/Traits.hpp"
9 #include "alpaka/dev/Traits.hpp"
10 #include "alpaka/dim/Traits.hpp"
11 #include "alpaka/idx/Traits.hpp"
13 
14 // Implementation details.
16 #include "alpaka/core/Decay.hpp"
18 #include "alpaka/dev/DevCpu.hpp"
19 #include "alpaka/idx/MapIdx.hpp"
20 #include "alpaka/kernel/Traits.hpp"
22 
23 #include <functional>
24 #include <stdexcept>
25 #include <tuple>
26 #include <type_traits>
27 #include <utility>
28 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
29 # include <iostream>
30 #endif
31 
32 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
33 
34 # if _OPENMP < 200203
35 # error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
36 # endif
37 
38 # include <omp.h>
39 
40 namespace alpaka
41 {
42  namespace detail
43  {
44  //! Executor of parallel OpenMP loop with the given schedule
45  //!
46  //! Is explicitly specialized for all supported schedule kinds to help code optimization by compilers.
47  //!
48  //! \tparam TKernel The kernel type.
49  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
50  //! \tparam TScheduleKind The schedule kind value.
51  template<typename TKernel, typename TSchedule, omp::Schedule::Kind TScheduleKind>
53 
54  //! Executor of parallel OpenMP loop with no schedule set
55  //!
56  //! Does not use chunk size.
57  //!
58  //! \tparam TKernel The kernel type.
59  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
60  template<typename TKernel, typename TSchedule>
61  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>
62  {
63  //! Run parallel OpenMP loop
64  //!
65  //! \tparam TLoopBody The loop body functor type.
66  //! \tparam TIdx The index type.
67  //!
68  //! \param loopBody The loop body functor instance, takes iteration index as input.
69  //! \param numIterations The number of loop iterations.
 //! Note: "nowait" removes the implicit barrier at the end of the worksharing loop;
 //! threads synchronize at the end of the enclosing parallel region instead.
70  template<typename TLoopBody, typename TIdx>
72  TKernel const&,
73  TLoopBody&& loopBody,
74  TIdx const numIterations,
75  TSchedule const&)
76  {
77 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
78  // header.
79  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
80  std::intmax_t i;
81 # pragma omp for nowait
82  for(i = 0; i < iNumBlocksInGrid; ++i)
83 # else
84 # pragma omp for nowait
85  for(TIdx i = 0; i < numIterations; ++i)
86 # endif
87  {
88  // Make another lambda to work around #1288
89  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
90  wrappedLoopBody(i);
91  }
92  }
93  };
94 
95  /* Implementations for Static, Dynamic and Guided follow the same pattern.
96  * There are two specializations of ParallelForImpl for compile-time dispatch depending on whether the
97  * OmpSchedule trait is specialized.
98  * The no trait case is further compile-time dispatched with a helper ParallelForStaticImpl.
99  * It is based on whether ompScheduleChunkSize member is available.
100  */
101 
102  //! Executor of parallel OpenMP loop with the static schedule
103  //!
104  //! Specialization for kernels specializing the OmpSchedule trait.
105  //!
106  //! \tparam TKernel The kernel type.
107  template<typename TKernel>
108  struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>
109  {
110  //! Run parallel OpenMP loop
111  //!
112  //! \tparam TLoopBody The loop body functor type.
113  //! \tparam TIdx The index type.
114  //!
115  //! \param loopBody The loop body functor instance, takes iteration index as input.
116  //! \param numIterations The number of loop iterations.
117  //! \param schedule The schedule object.
 //! The chunk size is taken at run time from schedule.chunkSize (see the pragma below).
118  template<typename TLoopBody, typename TIdx>
120  TKernel const&,
121  TLoopBody&& loopBody,
122  TIdx const numIterations,
123  omp::Schedule const& schedule)
124  {
125 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
126  // header.
127  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
128  std::intmax_t i;
129 # pragma omp for nowait schedule(static, schedule.chunkSize)
130  for(i = 0; i < iNumBlocksInGrid; ++i)
131 # else
132 # pragma omp for nowait schedule(static, schedule.chunkSize)
133  for(TIdx i = 0; i < numIterations; ++i)
134 # endif
135  {
136  // Make another lambda to work around #1288
137  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
138  wrappedLoopBody(i);
139  }
140  }
141  };
142 
143  //! Helper executor of parallel OpenMP loop with the static schedule
144  //!
145  //! General implementation is for TKernel types without member ompScheduleChunkSize.
146  //!
147  //! \tparam TKernel The kernel type.
148  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
149  template<typename TKernel, typename TSchedule, typename TSfinae = void>
151  {
152  //! Run parallel OpenMP loop
153  //!
154  //! \tparam TLoopBody The loop body functor type.
155  //! \tparam TIdx The index type.
156  //!
157  //! \param loopBody The loop body functor instance, takes iteration index as input.
158  //! \param numIterations The number of loop iterations.
 //! No chunk size is given: schedule(static) lets the OpenMP runtime pick the chunking.
159  template<typename TLoopBody, typename TIdx>
161  TKernel const&,
162  TLoopBody&& loopBody,
163  TIdx const numIterations,
164  TSchedule const&)
165  {
166 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
167  // header.
168  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
169  std::intmax_t i;
170 # pragma omp for nowait schedule(static)
171  for(i = 0; i < iNumBlocksInGrid; ++i)
172 # else
173 # pragma omp for nowait schedule(static)
174  for(TIdx i = 0; i < numIterations; ++i)
175 # endif
176  {
177  // Make another lambda to work around #1288
178  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
179  wrappedLoopBody(i);
180  }
181  }
182  };
183 
184  //! Helper type to check if TKernel has member ompScheduleChunkSize
185  //!
186  //! Is void for those types, ill-formed otherwise (std::void_t member-detection idiom,
186  //! used as the SFINAE argument of the ParallelFor*Impl specializations).
187  //!
188  //! \tparam TKernel The kernel type.
189  template<typename TKernel>
190  using HasScheduleChunkSize = std::void_t<decltype(TKernel::ompScheduleChunkSize)>;
191 
192  //! Helper executor of parallel OpenMP loop with the static schedule
193  //!
194  //! Specialization for kernels with ompScheduleChunkSize member.
195  //!
196  //! \tparam TKernel The kernel type.
197  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
198  template<typename TKernel, typename TSchedule>
199  struct ParallelForStaticImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
200  {
201  //! Run parallel OpenMP loop
202  //!
203  //! \tparam TLoopBody The loop body functor type.
204  //! \tparam TIdx The index type.
205  //!
206  //! \param kernel The kernel instance reference
207  //! \param loopBody The loop body functor instance, takes iteration index as input.
208  //! \param numIterations The number of loop iterations.
 //! The chunk size is read from the kernel's ompScheduleChunkSize member (see the pragma below).
209  template<typename TLoopBody, typename TIdx>
211  TKernel const& kernel,
212  TLoopBody&& loopBody,
213  TIdx const numIterations,
214  TSchedule const&)
215  {
216 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
217  // header.
218  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
219  std::intmax_t i;
220 # pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
221  for(i = 0; i < iNumBlocksInGrid; ++i)
222 # else
223 # pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
224  for(TIdx i = 0; i < numIterations; ++i)
225 # endif
226  {
227  // Make another lambda to work around #1288
228  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
229  wrappedLoopBody(i);
230  }
231  }
232  };
233 
234  //! Executor of parallel OpenMP loop with the static schedule
235  //!
236  //! Specialization for kernels not specializing the OmpSchedule trait.
237  //! Falls back to ParallelForStaticImpl for further dispatch.
238  //!
239  //! \tparam TKernel The kernel type.
240  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
241  template<typename TKernel, typename TSchedule>
242  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Static> : ParallelForStaticImpl<TKernel, TSchedule>
243  {
 // Intentionally empty: operator() is inherited from ParallelForStaticImpl.
244  };
245 
246  //! Executor of parallel OpenMP loop with the dynamic schedule
247  //!
248  //! Specialization for kernels specializing the OmpSchedule trait.
249  //!
250  //! \tparam TKernel The kernel type.
251  template<typename TKernel>
252  struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>
253  {
254  //! Run parallel OpenMP loop
255  //!
256  //! \tparam TLoopBody The loop body functor type.
257  //! \tparam TIdx The index type.
258  //!
259  //! \param loopBody The loop body functor instance, takes iteration index as input.
260  //! \param numIterations The number of loop iterations.
261  //! \param schedule The schedule object.
 //! The chunk size is taken at run time from schedule.chunkSize (see the pragma below).
262  template<typename TLoopBody, typename TIdx>
264  TKernel const&,
265  TLoopBody&& loopBody,
266  TIdx const numIterations,
267  omp::Schedule const& schedule)
268  {
269 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
270  // header.
271  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
272  std::intmax_t i;
273 # pragma omp for nowait schedule(dynamic, schedule.chunkSize)
274  for(i = 0; i < iNumBlocksInGrid; ++i)
275 # else
276 # pragma omp for nowait schedule(dynamic, schedule.chunkSize)
277  for(TIdx i = 0; i < numIterations; ++i)
278 # endif
279  {
280  // Make another lambda to work around #1288
281  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
282  wrappedLoopBody(i);
283  }
284  }
285  };
286 
287  //! Helper executor of parallel OpenMP loop with the dynamic schedule
288  //!
289  //! General implementation is for TKernel types without member ompScheduleChunkSize.
290  //!
291  //! \tparam TKernel The kernel type.
292  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
293  template<typename TKernel, typename TSchedule, typename TSfinae = void>
295  {
296  //! Run parallel OpenMP loop
297  //!
298  //! \tparam TLoopBody The loop body functor type.
299  //! \tparam TIdx The index type.
300  //!
301  //! \param loopBody The loop body functor instance, takes iteration index as input.
302  //! \param numIterations The number of loop iterations.
 //! No chunk size is given: schedule(dynamic) uses the runtime's default chunk size.
303  template<typename TLoopBody, typename TIdx>
305  TKernel const&,
306  TLoopBody&& loopBody,
307  TIdx const numIterations,
308  TSchedule const&)
309  {
310 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
311  // header.
312  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
313  std::intmax_t i;
314 # pragma omp for nowait schedule(dynamic)
315  for(i = 0; i < iNumBlocksInGrid; ++i)
316 # else
317 # pragma omp for nowait schedule(dynamic)
318  for(TIdx i = 0; i < numIterations; ++i)
319 # endif
320  {
321  // Make another lambda to work around #1288
322  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
323  wrappedLoopBody(i);
324  }
325  }
326  };
327 
328  //! Helper executor of parallel OpenMP loop with the dynamic schedule
329  //!
330  //! Specialization for kernels with ompScheduleChunkSize member.
331  //!
332  //! \tparam TKernel The kernel type.
333  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
334  template<typename TKernel, typename TSchedule>
335  struct ParallelForDynamicImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
336  {
337  //! Run parallel OpenMP loop
338  //!
339  //! \tparam TLoopBody The loop body functor type.
340  //! \tparam TIdx The index type.
341  //!
342  //! \param kernel The kernel instance reference
343  //! \param loopBody The loop body functor instance, takes iteration index as input.
344  //! \param numIterations The number of loop iterations.
 //! The chunk size is read from the kernel's ompScheduleChunkSize member (see the pragma below).
345  template<typename TLoopBody, typename TIdx>
347  TKernel const& kernel,
348  TLoopBody&& loopBody,
349  TIdx const numIterations,
350  TSchedule const&)
351  {
352 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
353  // header.
354  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
355  std::intmax_t i;
356 # pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
357  for(i = 0; i < iNumBlocksInGrid; ++i)
358 # else
359 # pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
360  for(TIdx i = 0; i < numIterations; ++i)
361 # endif
362  {
363  // Make another lambda to work around #1288
364  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
365  wrappedLoopBody(i);
366  }
367  }
368  };
369 
370  //! Executor of parallel OpenMP loop with the dynamic schedule
371  //!
372  //! Specialization for kernels not specializing the OmpSchedule trait.
373  //! Falls back to ParallelForDynamicImpl for further dispatch.
374  //!
375  //! \tparam TKernel The kernel type.
376  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
377  template<typename TKernel, typename TSchedule>
378  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Dynamic> : ParallelForDynamicImpl<TKernel, TSchedule>
379  {
 // Intentionally empty: operator() is inherited from ParallelForDynamicImpl.
380  };
381 
382  //! Executor of parallel OpenMP loop with the guided schedule
383  //!
384  //! Specialization for kernels specializing the OmpSchedule trait.
385  //!
386  //! \tparam TKernel The kernel type.
387  template<typename TKernel>
388  struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>
389  {
390  //! Run parallel OpenMP loop
391  //!
392  //! \tparam TLoopBody The loop body functor type.
393  //! \tparam TIdx The index type.
394  //!
395  //! \param loopBody The loop body functor instance, takes iteration index as input.
396  //! \param numIterations The number of loop iterations.
397  //! \param schedule The schedule object.
 //! The chunk size is taken at run time from schedule.chunkSize (see the pragma below).
398  template<typename TLoopBody, typename TIdx>
400  TKernel const&,
401  TLoopBody&& loopBody,
402  TIdx const numIterations,
403  omp::Schedule const& schedule)
404  {
405 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
406  // header.
407  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
408  std::intmax_t i;
409 # pragma omp for nowait schedule(guided, schedule.chunkSize)
410  for(i = 0; i < iNumBlocksInGrid; ++i)
411 # else
412 # pragma omp for nowait schedule(guided, schedule.chunkSize)
413  for(TIdx i = 0; i < numIterations; ++i)
414 # endif
415  {
416  // Make another lambda to work around #1288
417  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
418  wrappedLoopBody(i);
419  }
420  }
421  };
422 
423  //! Helper executor of parallel OpenMP loop with the guided schedule
424  //!
425  //! General implementation is for TKernel types without member ompScheduleChunkSize.
426  //!
427  //! \tparam TKernel The kernel type.
428  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
429  template<typename TKernel, typename TSchedule, typename TSfinae = void>
431  {
432  //! Run parallel OpenMP loop
433  //!
434  //! \tparam TLoopBody The loop body functor type.
435  //! \tparam TIdx The index type.
436  //!
437  //! \param loopBody The loop body functor instance, takes iteration index as input.
438  //! \param numIterations The number of loop iterations.
 //! No chunk size is given: schedule(guided) uses the runtime's default minimum chunk size.
439  template<typename TLoopBody, typename TIdx>
441  TKernel const&,
442  TLoopBody&& loopBody,
443  TIdx const numIterations,
444  TSchedule const&)
445  {
446 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
447  // header.
448  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
449  std::intmax_t i;
450 # pragma omp for nowait schedule(guided)
451  for(i = 0; i < iNumBlocksInGrid; ++i)
452 # else
453 # pragma omp for nowait schedule(guided)
454  for(TIdx i = 0; i < numIterations; ++i)
455 # endif
456  {
457  // Make another lambda to work around #1288
458  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
459  wrappedLoopBody(i);
460  }
461  }
462  };
463 
464  //! Helper executor of parallel OpenMP loop with the guided schedule
465  //!
466  //! Specialization for kernels with ompScheduleChunkSize member.
467  //!
468  //! \tparam TKernel The kernel type.
469  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
470  template<typename TKernel, typename TSchedule>
471  struct ParallelForGuidedImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
472  {
473  //! Run parallel OpenMP loop
474  //!
475  //! \tparam TLoopBody The loop body functor type.
476  //! \tparam TIdx The index type.
477  //!
478  //! \param kernel The kernel instance reference
479  //! \param loopBody The loop body functor instance, takes iteration index as input.
480  //! \param numIterations The number of loop iterations.
 //! The chunk size is read from the kernel's ompScheduleChunkSize member (see the pragma below).
481  template<typename TLoopBody, typename TIdx>
483  TKernel const& kernel,
484  TLoopBody&& loopBody,
485  TIdx const numIterations,
486  TSchedule const&)
487  {
488 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
489  // header.
490  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
491  std::intmax_t i;
492 # pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
493  for(i = 0; i < iNumBlocksInGrid; ++i)
494 # else
495 # pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
496  for(TIdx i = 0; i < numIterations; ++i)
497 # endif
498  {
499  // Make another lambda to work around #1288
500  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
501  wrappedLoopBody(i);
502  }
503  }
504  };
505 
506  //! Executor of parallel OpenMP loop with the guided schedule
507  //!
508  //! Specialization for kernels not specializing the OmpSchedule trait.
509  //! Falls back to ParallelForGuidedImpl for further dispatch.
510  //!
511  //! \tparam TKernel The kernel type.
512  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
513  template<typename TKernel, typename TSchedule>
514  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Guided> : ParallelForGuidedImpl<TKernel, TSchedule>
515  {
 // Intentionally empty: operator() is inherited from ParallelForGuidedImpl.
516  };
517 
518 # if _OPENMP >= 200805
 // schedule(auto) only exists since OpenMP 3.0 (_OPENMP == 200805), hence the guard.
519  //! Executor of parallel OpenMP loop with auto schedule set
520  //!
521  //! Does not use chunk size.
522  //!
523  //! \tparam TKernel The kernel type.
524  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
525  template<typename TKernel, typename TSchedule>
526  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Auto>
527  {
528  //! Run parallel OpenMP loop
529  //!
530  //! \tparam TLoopBody The loop body functor type.
531  //! \tparam TIdx The index type.
532  //!
533  //! \param loopBody The loop body functor instance, takes iteration index as input.
534  //! \param numIterations The number of loop iterations.
535  template<typename TLoopBody, typename TIdx>
537  TKernel const&,
538  TLoopBody&& loopBody,
539  TIdx const numIterations,
540  TSchedule const&)
541  {
542 # pragma omp for nowait schedule(auto)
543  for(TIdx i = 0; i < numIterations; ++i)
544  {
545  // Make another lambda to work around #1288
546  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
547  wrappedLoopBody(i);
548  }
549  }
550  };
551 # endif
552 
553  //! Executor of parallel OpenMP loop with runtime schedule set
554  //!
555  //! Does not use chunk size.
 //! With schedule(runtime) the actual schedule and chunk size are chosen by the
 //! OpenMP runtime (e.g. via the OMP_SCHEDULE environment variable).
556  //!
557  //! \tparam TKernel The kernel type.
558  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
559  template<typename TKernel, typename TSchedule>
560  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Runtime>
561  {
562  //! Run parallel OpenMP loop
563  //!
564  //! \tparam TLoopBody The loop body functor type.
565  //! \tparam TIdx The index type.
566  //!
567  //! \param loopBody The loop body functor instance, takes iteration index as input.
568  //! \param numIterations The number of loop iterations.
569  template<typename TLoopBody, typename TIdx>
571  TKernel const&,
572  TLoopBody&& loopBody,
573  TIdx const numIterations,
574  TSchedule const&)
575  {
576 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
577  // header.
578  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
579  std::intmax_t i;
580 # pragma omp for nowait schedule(runtime)
581  for(i = 0; i < iNumBlocksInGrid; ++i)
582 # else
583 # pragma omp for nowait schedule(runtime)
584  for(TIdx i = 0; i < numIterations; ++i)
585 # endif
586  {
587  // Make another lambda to work around #1288
588  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
589  wrappedLoopBody(i);
590  }
591  }
592  };
593 
594  //! Executor of parallel OpenMP loop
595  //!
596  //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
597  //! The default implementation is for the kernels that do not set schedule in any way, compile-time dispatch.
598  //!
599  //! \tparam TKernel The kernel type.
600  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
601  template<typename TKernel, typename TSchedule, typename TSfinae = void>
602  struct ParallelFor
603  {
604  //! Run parallel OpenMP loop
605  //!
606  //! \tparam TLoopBody The loop body functor type.
607  //! \tparam TIdx The index type.
608  //!
609  //! \param kernel The kernel instance reference
610  //! \param loopBody The loop body functor instance, takes iteration index as input.
611  //! \param numIterations The number of loop iterations.
612  //! \param schedule The schedule object.
613  template<typename TLoopBody, typename TIdx>
615  TKernel const& kernel,
616  TLoopBody&& loopBody,
617  TIdx const numIterations,
618  TSchedule const& schedule)
619  {
620  // Forward to ParallelForImpl that performs dispatch by chunk size
622  kernel,
623  std::forward<TLoopBody>(loopBody),
624  numIterations,
625  schedule);
626  }
627  };
628 
629  //! Executor of parallel OpenMP loop
630  //!
631  //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
632  //! Specialization for kernels specializing the OmpSchedule trait, run-time dispatch.
633  //!
634  //! \tparam TKernel The kernel type.
635  template<typename TKernel>
636  struct ParallelFor<TKernel, omp::Schedule>
637  {
638  //! Run parallel OpenMP loop
639  //!
640  //! \tparam TLoopBody The loop body functor type.
641  //! \tparam TIdx The index type.
642  //!
643  //! \param kernel The kernel instance reference
644  //! \param loopBody The loop body functor instance, takes iteration index as input.
645  //! \param numIterations The number of loop iterations.
646  //! \param schedule The schedule object.
647  template<typename TLoopBody, typename TIdx>
649  TKernel const& kernel,
650  TLoopBody&& loopBody,
651  TIdx const numIterations,
652  omp::Schedule const& schedule)
653  {
654  // Forward to ParallelForImpl that performs dispatch by chunk size.
 // Run-time switch on schedule.kind: each case forwards to the matching
 // compile-time ParallelForImpl specialization.
655  switch(schedule.kind)
656  {
659  kernel,
660  std::forward<TLoopBody>(loopBody),
661  numIterations,
662  schedule);
663  break;
666  kernel,
667  std::forward<TLoopBody>(loopBody),
668  numIterations,
669  schedule);
670  break;
673  kernel,
674  std::forward<TLoopBody>(loopBody),
675  numIterations,
676  schedule);
677  break;
680  kernel,
681  std::forward<TLoopBody>(loopBody),
682  numIterations,
683  schedule);
684  break;
685 # if _OPENMP >= 200805
686  case omp::Schedule::Auto:
688  kernel,
689  std::forward<TLoopBody>(loopBody),
690  numIterations,
691  schedule);
692  break;
693 # endif
696  kernel,
697  std::forward<TLoopBody>(loopBody),
698  numIterations,
699  schedule);
700  break;
701  }
702  }
703  };
704 
705  //! Helper type to check if TSchedule is a type originating from OmpSchedule trait definition
706  //!
706  //! Evaluates to std::true_type exactly when TSchedule is omp::Schedule itself.
707  //!
707  //! \tparam TSchedule The schedule type.
708  template<typename TSchedule>
710  = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
711 
712  //! Helper type to check if member ompScheduleKind of TKernel should be used
713  //!
714  //! For that it has to be present, and no OmpSchedule trait specialized.
715  //! Is void for those types, ill-formed otherwise.
 //! Note: sizeof(TKernel::ompScheduleKind) is only used to detect that the member
 //! exists; its value is irrelevant for this check.
716  //!
717  //! \tparam TKernel The kernel type.
718  //! \tparam TSchedule The schedule type.
719  template<typename TKernel, typename TSchedule>
721  = std::enable_if_t<sizeof(TKernel::ompScheduleKind) && !IsOmpScheduleTraitSpecialized<TSchedule>::value>;
722 
723  //! Executor of parallel OpenMP loop
724  //!
725  //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
726  //! Specialization for kernels with ompScheduleKind member, compile-time dispatch.
727  //!
728  //! \tparam TKernel The kernel type.
729  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
730  template<typename TKernel, typename TSchedule>
731  struct ParallelFor<TKernel, TSchedule, UseScheduleKind<TKernel, TSchedule>>
732  {
733  //! Run parallel OpenMP loop
734  //!
735  //! \tparam TLoopBody The loop body functor type.
736  //! \tparam TIdx The index type.
737  //!
738  //! \param kernel The kernel instance reference
739  //! \param loopBody The loop body functor instance, takes iteration index as input.
740  //! \param numIterations The number of loop iterations.
741  //! \param schedule The schedule object.
742  template<typename TLoopBody, typename TIdx>
744  TKernel const& kernel,
745  TLoopBody&& loopBody,
746  TIdx const numIterations,
747  TSchedule const& schedule)
748  {
749  // Forward to ParallelForImpl that performs dispatch by chunk size
751  kernel,
752  std::forward<TLoopBody>(loopBody),
753  numIterations,
754  schedule);
755  }
756  };
757 
758  //! Run parallel OpenMP loop
759  //!
760  //! \tparam TKernel The kernel type.
761  //! \tparam TLoopBody The loop body functor type.
762  //! \tparam TIdx The index type.
763  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
764  //!
765  //! \param kernel The kernel instance reference,
766  //! not perfect-forwarded to shorten SFINAE internally.
767  //! \param loopBody The loop body functor instance, takes iteration index as input.
768  //! \param numIterations The number of loop iterations.
769  //! \param schedule The schedule object.
770  template<typename TKernel, typename TLoopBody, typename TIdx, typename TSchedule>
772  TKernel const& kernel,
773  TLoopBody&& loopBody,
774  TIdx const numIterations,
775  TSchedule const& schedule)
776  {
777  // Forward to ParallelFor that performs first a dispatch by schedule kind, and then by chunk size
778  ParallelFor<TKernel, TSchedule>{}(kernel, std::forward<TLoopBody>(loopBody), numIterations, schedule);
779  }
780 
781  } // namespace detail
782 
783  //! The CPU OpenMP 2.0 block accelerator execution task.
784  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
785  class TaskKernelCpuOmp2Blocks final : public WorkDivMembers<TDim, TIdx>
786  {
787  public:
 //! Constructor: stores the work division, the kernel function object and decayed
 //! copies of the kernel invocation arguments.
788  template<typename TWorkDiv>
789  ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
790  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
791  , m_kernelFnObj(kernelFnObj)
792  , m_args(std::forward<TArgs>(args)...)
793  {
794  static_assert(
795  Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
796  "The work division and the execution task have to be of the same dimensionality!");
797  }
798 
799  //! Executes the kernel function object.
 //! Throws std::runtime_error if more than one thread per block is requested,
 //! since this accelerator parallelizes over blocks only.
800  ALPAKA_FN_HOST auto operator()() const -> void
801  {
803 
804  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
805  auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
806  auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
807 
808  // Get the size of the block shared dynamic memory.
809  auto const blockSharedMemDynSizeBytes = std::apply(
810  [&](std::decay_t<TArgs> const&... args)
811  {
812  return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
813  m_kernelFnObj,
814  blockThreadExtent,
815  threadElemExtent,
816  args...);
817  },
818  m_args);
819 
820 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
821  std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
822  << std::endl;
823 # endif
824 
825  // The number of blocks in the grid.
826  TIdx const numBlocksInGrid(gridBlockExtent.prod());
827  if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
828  {
829  throw std::runtime_error("Only one thread per block allowed in the OpenMP 2.0 block accelerator!");
830  }
831 
832  // Get the OpenMP schedule information for the given kernel and parameter types
833  auto const schedule = std::apply(
834  [&](std::decay_t<TArgs> const&... args) {
835  return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
836  m_kernelFnObj,
837  blockThreadExtent,
838  threadElemExtent,
839  args...);
840  },
841  m_args);
842 
 // If we are already inside a parallel region (nested launch), reuse it;
 // otherwise open a new parallel region around parallelFn.
843  if(::omp_in_parallel() != 0)
844  {
845 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
846  std::cout << __func__ << " already within a parallel region." << std::endl;
847 # endif
848  parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
849  }
850  else
851  {
852 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
853  std::cout << __func__ << " opening new parallel region." << std::endl;
854 # endif
855 # pragma omp parallel
856  parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
857  }
858  }
859 
860  private:
 //! Executed by every thread of the surrounding parallel region; distributes the
 //! grid blocks across the threads via detail::parallelFor.
861  template<typename TSchedule>
862  ALPAKA_FN_HOST auto parallelFn(
863  std::size_t const& blockSharedMemDynSizeBytes,
864  TIdx const& numBlocksInGrid,
865  Vec<TDim, TIdx> const& gridBlockExtent,
866  TSchedule const& schedule) const -> void
867  {
 // Sanity check / debug output executed by a single thread; "nowait" lets the
 // remaining threads proceed without a barrier.
868 # pragma omp single nowait
869  {
870  // The OpenMP runtime does not create a parallel region when either:
871  // * only one thread is required in the num_threads clause
872  // * or only one thread is available
873  // In all other cases we expect to be in a parallel region now.
874  if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
875  {
876  throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
877  }
878 
879 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
880  std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
881 # endif
882  }
883 
 // One accelerator instance per OpenMP thread (this function body runs per thread).
884  AccCpuOmp2Blocks<TDim, TIdx> acc(
885  *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
886  blockSharedMemDynSizeBytes);
887 
888  // Body of the OpenMP parallel loop to be executed.
889  // Index type is auto since we have a difference for OpenMP 2.0 and later ones
890  auto loopBody = [&](auto currentIndex)
891  {
892 # if _OPENMP < 200805
893  auto const i_tidx = static_cast<TIdx>(currentIndex); // for issue #840
894  auto const index = Vec<DimInt<1u>, TIdx>(i_tidx); // for issue #840
895 # else
896  auto const index = Vec<DimInt<1u>, TIdx>(currentIndex); // for issue #840
897 # endif
898  acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
899 
900  std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
901 
902  // After a block has been processed, the shared memory has to be deleted.
903  freeSharedVars(acc);
904  };
905 
906  detail::parallelFor(m_kernelFnObj, loopBody, numBlocksInGrid, schedule);
907  }
908 
909  TKernelFnObj m_kernelFnObj;
910  std::tuple<std::decay_t<TArgs>...> m_args;
911  };
912 
913  namespace trait
914  {
 // Trait specializations mapping the execution task to its accelerator, device,
 // dimension, platform and index types.
915  //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
916  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
917  struct AccType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
918  {
920  };
921 
922  //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
923  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
924  struct DevType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
925  {
926  using type = DevCpu;
927  };
928 
929  //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
930  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
931  struct DimType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
932  {
933  using type = TDim;
934  };
935 
936  //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
937  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
938  struct PlatformType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
939  {
940  using type = PlatformCpu;
941  };
942 
943  //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
944  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
945  struct IdxType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
946  {
947  using type = TIdx;
948  };
949  } // namespace trait
950 } // namespace alpaka
951 
952 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU OpenMP 2.0 block accelerator.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU OpenMP 2.0 block accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
Definition: Common.hpp:95
std::void_t< decltype(TKernel::ompScheduleChunkSize)> HasScheduleChunkSize
Helper type to check if TKernel has member ompScheduleChunkSize.
std::integral_constant< bool, std::is_same< TSchedule, omp::Schedule >::value > IsOmpScheduleTraitSpecialized
Helper type to check if TSchedule is a type originating from OmpSchedule trait definition.
ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
std::enable_if_t< sizeof(TKernel::ompScheduleKind) &&!IsOmpScheduleTraitSpecialized< TSchedule >::value > UseScheduleKind
Helper type to check if member ompScheduleKind of TKernel should be used.
The alpaka accelerator library.
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition: Traits.hpp:54
The CPU device platform.
Definition: PlatformCpu.hpp:18
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the given schedule.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless...
Definition: OmpSchedule.hpp:20
int chunkSize
Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a fixed-width type to match the chunk size type of the OpenMP API.
Definition: OmpSchedule.hpp:44
Kind kind
Schedule kind.
Definition: OmpSchedule.hpp:40
The accelerator type trait.
Definition: Traits.hpp:37
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30