alpaka
Abstraction Library for Parallel Kernel Acceleration
TaskKernelCpuOmp2Blocks.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 // Specialized traits.
8 #include "alpaka/acc/Traits.hpp"
9 #include "alpaka/dev/Traits.hpp"
10 #include "alpaka/dim/Traits.hpp"
11 #include "alpaka/idx/Traits.hpp"
13 
14 // Implementation details.
16 #include "alpaka/core/Decay.hpp"
18 #include "alpaka/dev/DevCpu.hpp"
19 #include "alpaka/idx/MapIdx.hpp"
21 #include "alpaka/kernel/Traits.hpp"
24 
25 #include <functional>
26 #include <stdexcept>
27 #include <tuple>
28 #include <type_traits>
29 #include <utility>
30 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
31 # include <iostream>
32 #endif
33 
34 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
35 
36 # if BOOST_COMP_CLANG
37 # pragma clang diagnostic push
38 # pragma clang diagnostic ignored "-Wswitch-default"
39 # endif
40 
41 # if _OPENMP < 200203
42 # error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
43 # endif
44 
45 # include <omp.h>
46 
47 namespace alpaka
48 {
49  namespace detail
50  {
51  //! Executor of parallel OpenMP loop with the given schedule
52  //!
53  //! Is explicitly specialized for all supported schedule kinds to help code optimization by compilers.
54  //!
55  //! \tparam TKernel The kernel type.
56  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
57  //! \tparam TScheduleKind The schedule kind value.
58  template<typename TKernel, typename TSchedule, omp::Schedule::Kind TScheduleKind>
60 
61  //! Executor of parallel OpenMP loop with no schedule set
62  //!
63  //! Does not use chunk size.
64  //!
65  //! \tparam TKernel The kernel type.
66  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
67  template<typename TKernel, typename TSchedule>
68  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>
69  {
70  //! Run parallel OpenMP loop
71  //!
72  //! \tparam TLoopBody The loop body functor type.
73  //! \tparam TIdx The index type.
74  //!
75  //! \param loopBody The loop body functor instance, takes iteration index as input.
76  //! \param numIterations The number of loop iterations.
77  template<typename TLoopBody, typename TIdx>
79  TKernel const&,
80  TLoopBody&& loopBody,
81  TIdx const numIterations,
82  TSchedule const&)
83  {
84 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
85  // header.
86  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
87  std::intmax_t i;
88 # pragma omp for nowait
89  for(i = 0; i < iNumBlocksInGrid; ++i)
90 # else
91 # pragma omp for nowait
92  for(TIdx i = 0; i < numIterations; ++i)
93 # endif
94  {
95  // Make another lambda to work around #1288
96  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
97  wrappedLoopBody(i);
98  }
99  }
100  };
101 
102  /* Implementations for Static, Dynamic and Guided follow the same pattern.
103  * There are two specializations of ParallelForImpl for compile-time dispatch depending on whether the
104  * OmpSchedule trait is specialized.
105  * The no trait case is further compile-time dispatched with a helper ParallelForStaticImpl.
106  * It is based on whether ompScheduleChunkSize member is available.
107  */
108 
109  //! Executor of parallel OpenMP loop with the static schedule
110  //!
111  //! Specialization for kernels specializing the OmpSchedule trait.
112  //!
113  //! \tparam TKernel The kernel type.
114  template<typename TKernel>
115  struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>
116  {
117  //! Run parallel OpenMP loop
118  //!
119  //! \tparam TLoopBody The loop body functor type.
120  //! \tparam TIdx The index type.
121  //!
122  //! \param loopBody The loop body functor instance, takes iteration index as input.
123  //! \param numIterations The number of loop iterations.
124  //! \param schedule The schedule object.
125  template<typename TLoopBody, typename TIdx>
127  TKernel const&,
128  TLoopBody&& loopBody,
129  TIdx const numIterations,
130  omp::Schedule const& schedule)
131  {
132 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
133  // header.
134  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
135  std::intmax_t i;
136 # pragma omp for nowait schedule(static, schedule.chunkSize)
137  for(i = 0; i < iNumBlocksInGrid; ++i)
138 # else
139 # pragma omp for nowait schedule(static, schedule.chunkSize)
140  for(TIdx i = 0; i < numIterations; ++i)
141 # endif
142  {
143  // Make another lambda to work around #1288
144  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
145  wrappedLoopBody(i);
146  }
147  }
148  };
149 
150  //! Helper executor of parallel OpenMP loop with the static schedule
151  //!
152  //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
153  //!
154  //! \tparam TKernel The kernel type.
155  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
156  template<typename TKernel, typename TSchedule, typename TSfinae = void>
158  {
159  //! Run parallel OpenMP loop
160  //!
161  //! \tparam TLoopBody The loop body functor type.
162  //! \tparam TIdx The index type.
163  //!
164  //! \param loopBody The loop body functor instance, takes iteration index as input.
165  //! \param numIterations The number of loop iterations.
166  template<typename TLoopBody, typename TIdx>
168  TKernel const&,
169  TLoopBody&& loopBody,
170  TIdx const numIterations,
171  TSchedule const&)
172  {
173 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
174  // header.
175  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
176  std::intmax_t i;
177 # pragma omp for nowait schedule(static)
178  for(i = 0; i < iNumBlocksInGrid; ++i)
179 # else
180 # pragma omp for nowait schedule(static)
181  for(TIdx i = 0; i < numIterations; ++i)
182 # endif
183  {
184  // Make another lambda to work around #1288
185  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
186  wrappedLoopBody(i);
187  }
188  }
189  };
190 
191  //! Helper type to check if TKernel has member ompScheduleChunkSize
192  //!
193  //! Is void for those types, ill-formed otherwise.
194  //!
195  //! \tparam TKernel The kernel type.
196  template<typename TKernel>
197  using HasScheduleChunkSize = std::void_t<decltype(TKernel::ompScheduleChunkSize)>;
198 
199  //! Helper executor of parallel OpenMP loop with the static schedule
200  //!
201  //! Specialization for kernels with ompScheduleChunkSize member.
202  //!
203  //! \tparam TKernel The kernel type.
204  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
205  template<typename TKernel, typename TSchedule>
206  struct ParallelForStaticImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
207  {
208  //! Run parallel OpenMP loop
209  //!
210  //! \tparam TLoopBody The loop body functor type.
211  //! \tparam TIdx The index type.
212  //!
213  //! \param kernel The kernel instance reference
214  //! \param loopBody The loop body functor instance, takes iteration index as input.
215  //! \param numIterations The number of loop iterations.
216  template<typename TLoopBody, typename TIdx>
218  TKernel const& kernel,
219  TLoopBody&& loopBody,
220  TIdx const numIterations,
221  TSchedule const&)
222  {
223 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
224  // header.
225  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
226  std::intmax_t i;
227 # pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
228  for(i = 0; i < iNumBlocksInGrid; ++i)
229 # else
230 # pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
231  for(TIdx i = 0; i < numIterations; ++i)
232 # endif
233  {
234  // Make another lambda to work around #1288
235  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
236  wrappedLoopBody(i);
237  }
238  }
239  };
240 
241  //! Executor of parallel OpenMP loop with the static schedule
242  //!
243  //! Specialization for kernels not specializing the OmpSchedule trait.
244  //! Falls back to ParallelForStaticImpl for further dispatch.
245  //!
246  //! \tparam TKernel The kernel type.
247  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
248  template<typename TKernel, typename TSchedule>
249  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Static> : ParallelForStaticImpl<TKernel, TSchedule>
250  {
251  };
252 
253  //! Executor of parallel OpenMP loop with the dynamic schedule
254  //!
255  //! Specialization for kernels specializing the OmpSchedule trait.
256  //!
257  //! \tparam TKernel The kernel type.
258  template<typename TKernel>
259  struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>
260  {
261  //! Run parallel OpenMP loop
262  //!
263  //! \tparam TLoopBody The loop body functor type.
264  //! \tparam TIdx The index type.
265  //!
266  //! \param loopBody The loop body functor instance, takes iteration index as input.
267  //! \param numIterations The number of loop iterations.
268  //! \param schedule The schedule object.
269  template<typename TLoopBody, typename TIdx>
271  TKernel const&,
272  TLoopBody&& loopBody,
273  TIdx const numIterations,
274  omp::Schedule const& schedule)
275  {
276 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
277  // header.
278  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
279  std::intmax_t i;
280 # pragma omp for nowait schedule(dynamic, schedule.chunkSize)
281  for(i = 0; i < iNumBlocksInGrid; ++i)
282 # else
283 # pragma omp for nowait schedule(dynamic, schedule.chunkSize)
284  for(TIdx i = 0; i < numIterations; ++i)
285 # endif
286  {
287  // Make another lambda to work around #1288
288  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
289  wrappedLoopBody(i);
290  }
291  }
292  };
293 
294  //! Helper executor of parallel OpenMP loop with the dynamic schedule
295  //!
296  //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
297  //!
298  //! \tparam TKernel The kernel type.
299  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
300  template<typename TKernel, typename TSchedule, typename TSfinae = void>
302  {
303  //! Run parallel OpenMP loop
304  //!
305  //! \tparam TLoopBody The loop body functor type.
306  //! \tparam TIdx The index type.
307  //!
308  //! \param loopBody The loop body functor instance, takes iteration index as input.
309  //! \param numIterations The number of loop iterations.
310  template<typename TLoopBody, typename TIdx>
312  TKernel const&,
313  TLoopBody&& loopBody,
314  TIdx const numIterations,
315  TSchedule const&)
316  {
317 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
318  // header.
319  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
320  std::intmax_t i;
321 # pragma omp for nowait schedule(dynamic)
322  for(i = 0; i < iNumBlocksInGrid; ++i)
323 # else
324 # pragma omp for nowait schedule(dynamic)
325  for(TIdx i = 0; i < numIterations; ++i)
326 # endif
327  {
328  // Make another lambda to work around #1288
329  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
330  wrappedLoopBody(i);
331  }
332  }
333  };
334 
335  //! Helper executor of parallel OpenMP loop with the dynamic schedule
336  //!
337  //! Specialization for kernels with ompScheduleChunkSize member.
338  //!
339  //! \tparam TKernel The kernel type.
340  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
341  template<typename TKernel, typename TSchedule>
342  struct ParallelForDynamicImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
343  {
344  //! Run parallel OpenMP loop
345  //!
346  //! \tparam TLoopBody The loop body functor type.
347  //! \tparam TIdx The index type.
348  //!
349  //! \param kernel The kernel instance reference
350  //! \param loopBody The loop body functor instance, takes iteration index as input.
351  //! \param numIterations The number of loop iterations.
352  template<typename TLoopBody, typename TIdx>
354  TKernel const& kernel,
355  TLoopBody&& loopBody,
356  TIdx const numIterations,
357  TSchedule const&)
358  {
359 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
360  // header.
361  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
362  std::intmax_t i;
363 # pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
364  for(i = 0; i < iNumBlocksInGrid; ++i)
365 # else
366 # pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
367  for(TIdx i = 0; i < numIterations; ++i)
368 # endif
369  {
370  // Make another lambda to work around #1288
371  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
372  wrappedLoopBody(i);
373  }
374  }
375  };
376 
377  //! Executor of parallel OpenMP loop with the dynamic schedule
378  //!
379  //! Specialization for kernels not specializing the OmpSchedule trait.
380  //! Falls back to ParallelForDynamicImpl for further dispatch.
381  //!
382  //! \tparam TKernel The kernel type.
383  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
384  template<typename TKernel, typename TSchedule>
385  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Dynamic> : ParallelForDynamicImpl<TKernel, TSchedule>
386  {
387  };
388 
389  //! Executor of parallel OpenMP loop with the guided schedule
390  //!
391  //! Specialization for kernels specializing the OmpSchedule trait.
392  //!
393  //! \tparam TKernel The kernel type.
394  template<typename TKernel>
395  struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>
396  {
397  //! Run parallel OpenMP loop
398  //!
399  //! \tparam TLoopBody The loop body functor type.
400  //! \tparam TIdx The index type.
401  //!
402  //! \param loopBody The loop body functor instance, takes iteration index as input.
403  //! \param numIterations The number of loop iterations.
404  //! \param schedule The schedule object.
405  template<typename TLoopBody, typename TIdx>
407  TKernel const&,
408  TLoopBody&& loopBody,
409  TIdx const numIterations,
410  omp::Schedule const& schedule)
411  {
412 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
413  // header.
414  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
415  std::intmax_t i;
416 # pragma omp for nowait schedule(guided, schedule.chunkSize)
417  for(i = 0; i < iNumBlocksInGrid; ++i)
418 # else
419 # pragma omp for nowait schedule(guided, schedule.chunkSize)
420  for(TIdx i = 0; i < numIterations; ++i)
421 # endif
422  {
423  // Make another lambda to work around #1288
424  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
425  wrappedLoopBody(i);
426  }
427  }
428  };
429 
430  //! Helper executor of parallel OpenMP loop with the guided schedule
431  //!
432  //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
433  //!
434  //! \tparam TKernel The kernel type.
435  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
436  template<typename TKernel, typename TSchedule, typename TSfinae = void>
438  {
439  //! Run parallel OpenMP loop
440  //!
441  //! \tparam TLoopBody The loop body functor type.
442  //! \tparam TIdx The index type.
443  //!
444  //! \param loopBody The loop body functor instance, takes iteration index as input.
445  //! \param numIterations The number of loop iterations.
446  template<typename TLoopBody, typename TIdx>
448  TKernel const&,
449  TLoopBody&& loopBody,
450  TIdx const numIterations,
451  TSchedule const&)
452  {
453 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
454  // header.
455  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
456  std::intmax_t i;
457 # pragma omp for nowait schedule(guided)
458  for(i = 0; i < iNumBlocksInGrid; ++i)
459 # else
460 # pragma omp for nowait schedule(guided)
461  for(TIdx i = 0; i < numIterations; ++i)
462 # endif
463  {
464  // Make another lambda to work around #1288
465  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
466  wrappedLoopBody(i);
467  }
468  }
469  };
470 
471  //! Helper executor of parallel OpenMP loop with the guided schedule
472  //!
473  //! Specialization for kernels with ompScheduleChunkSize member.
474  //!
475  //! \tparam TKernel The kernel type.
476  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
477  template<typename TKernel, typename TSchedule>
478  struct ParallelForGuidedImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
479  {
480  //! Run parallel OpenMP loop
481  //!
482  //! \tparam TLoopBody The loop body functor type.
483  //! \tparam TIdx The index type.
484  //!
485  //! \param kernel The kernel instance reference
486  //! \param loopBody The loop body functor instance, takes iteration index as input.
487  //! \param numIterations The number of loop iterations.
488  template<typename TLoopBody, typename TIdx>
490  TKernel const& kernel,
491  TLoopBody&& loopBody,
492  TIdx const numIterations,
493  TSchedule const&)
494  {
495 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
496  // header.
497  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
498  std::intmax_t i;
499 # pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
500  for(i = 0; i < iNumBlocksInGrid; ++i)
501 # else
502 # pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
503  for(TIdx i = 0; i < numIterations; ++i)
504 # endif
505  {
506  // Make another lambda to work around #1288
507  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
508  wrappedLoopBody(i);
509  }
510  }
511  };
512 
513  //! Executor of parallel OpenMP loop with the guided schedule
514  //!
515  //! Specialization for kernels not specializing the OmpSchedule trait.
516  //! Falls back to ParallelForGuidedImpl for further dispatch.
517  //!
518  //! \tparam TKernel The kernel type.
519  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
520  template<typename TKernel, typename TSchedule>
521  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Guided> : ParallelForGuidedImpl<TKernel, TSchedule>
522  {
523  };
524 
#    if _OPENMP >= 200805
        //! Executor of parallel OpenMP loop with auto schedule set
        //!
        //! Does not use chunk size. Only available for OpenMP >= 3.0 (schedule(auto) was added in 3.0).
        //!
        //! \tparam TKernel The kernel type.
        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
        template<typename TKernel, typename TSchedule>
        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Auto>
        {
            //! Run parallel OpenMP loop
            //!
            //! \tparam TLoopBody The loop body functor type.
            //! \tparam TIdx The index type.
            //!
            //! \param loopBody The loop body functor instance, takes iteration index as input.
            //! \param numIterations The number of loop iterations.
            template<typename TLoopBody, typename TIdx>
            ALPAKA_FN_HOST void operator()(
                TKernel const&,
                TLoopBody&& loopBody,
                TIdx const numIterations,
                TSchedule const&)
            {
#        pragma omp for nowait schedule(auto)
                for(TIdx i = 0; i < numIterations; ++i)
                {
                    // Make another lambda to work around #1288
                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
                    wrappedLoopBody(i);
                }
            }
        };
#    endif
559 
560  //! Executor of parallel OpenMP loop with runtime schedule set
561  //!
562  //! Does not use chunk size.
563  //!
564  //! \tparam TKernel The kernel type.
565  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
566  template<typename TKernel, typename TSchedule>
567  struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Runtime>
568  {
569  //! Run parallel OpenMP loop
570  //!
571  //! \tparam TLoopBody The loop body functor type.
572  //! \tparam TIdx The index type.
573  //!
574  //! \param loopBody The loop body functor instance, takes iteration index as input.
575  //! \param numIterations The number of loop iterations.
576  template<typename TLoopBody, typename TIdx>
578  TKernel const&,
579  TLoopBody&& loopBody,
580  TIdx const numIterations,
581  TSchedule const&)
582  {
583 # if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
584  // header.
585  std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
586  std::intmax_t i;
587 # pragma omp for nowait schedule(runtime)
588  for(i = 0; i < iNumBlocksInGrid; ++i)
589 # else
590 # pragma omp for nowait schedule(runtime)
591  for(TIdx i = 0; i < numIterations; ++i)
592 # endif
593  {
594  // Make another lambda to work around #1288
595  auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
596  wrappedLoopBody(i);
597  }
598  }
599  };
600 
601  //! Executor of parallel OpenMP loop
602  //!
603  //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
604  //! The default implementation is for the kernels that do not set schedule in any way, compile-time dispatch.
605  //!
606  //! \tparam TKernel The kernel type.
607  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
608  template<typename TKernel, typename TSchedule, typename TSfinae = void>
609  struct ParallelFor
610  {
611  //! Run parallel OpenMP loop
612  //!
613  //! \tparam TLoopBody The loop body functor type.
614  //! \tparam TIdx The index type.
615  //!
616  //! \param kernel The kernel instance reference
617  //! \param loopBody The loop body functor instance, takes iteration index as input.
618  //! \param numIterations The number of loop iterations.
619  //! \param schedule The schedule object.
620  template<typename TLoopBody, typename TIdx>
622  TKernel const& kernel,
623  TLoopBody&& loopBody,
624  TIdx const numIterations,
625  TSchedule const& schedule)
626  {
627  // Forward to ParallelForImpl that performs dispatch by by chunk size
629  kernel,
630  std::forward<TLoopBody>(loopBody),
631  numIterations,
632  schedule);
633  }
634  };
635 
636  //! Executor of parallel OpenMP loop
637  //!
638  //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
639  //! Specialization for kernels specializing the OmpSchedule trait, run-time dispatch.
640  //!
641  //! \tparam TKernel The kernel type.
642  template<typename TKernel>
643  struct ParallelFor<TKernel, omp::Schedule>
644  {
645  //! Run parallel OpenMP loop
646  //!
647  //! \tparam TLoopBody The loop body functor type.
648  //! \tparam TIdx The index type.
649  //!
650  //! \param kernel The kernel instance reference
651  //! \param loopBody The loop body functor instance, takes iteration index as input.
652  //! \param numIterations The number of loop iterations.
653  //! \param schedule The schedule object.
654  template<typename TLoopBody, typename TIdx>
656  TKernel const& kernel,
657  TLoopBody&& loopBody,
658  TIdx const numIterations,
659  omp::Schedule const& schedule)
660  {
661  // Forward to ParallelForImpl that performs dispatch by by chunk size
662  switch(schedule.kind)
663  {
666  kernel,
667  std::forward<TLoopBody>(loopBody),
668  numIterations,
669  schedule);
670  break;
673  kernel,
674  std::forward<TLoopBody>(loopBody),
675  numIterations,
676  schedule);
677  break;
680  kernel,
681  std::forward<TLoopBody>(loopBody),
682  numIterations,
683  schedule);
684  break;
687  kernel,
688  std::forward<TLoopBody>(loopBody),
689  numIterations,
690  schedule);
691  break;
692 # if _OPENMP >= 200805
693  case omp::Schedule::Auto:
695  kernel,
696  std::forward<TLoopBody>(loopBody),
697  numIterations,
698  schedule);
699  break;
700 # endif
703  kernel,
704  std::forward<TLoopBody>(loopBody),
705  numIterations,
706  schedule);
707  break;
708  }
709  }
710  };
711 
712  //! Helper type to check if TSchedule is a type originating from OmpSchedule trait definition
713  //!
714  //! \tparam TSchedule The schedule type.
715  template<typename TSchedule>
717  = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
718 
719  //! Helper type to check if member ompScheduleKind of TKernel should be used
720  //!
721  //! For that it has to be present, and no OmpSchedule trait specialized.
722  //! Is void for those types, ill-formed otherwise.
723  //!
724  //! \tparam TKernel The kernel type.
725  //! \tparam TSchedule The schedule type.
726  template<typename TKernel, typename TSchedule>
728  = std::enable_if_t<sizeof(TKernel::ompScheduleKind) && !IsOmpScheduleTraitSpecialized<TSchedule>::value>;
729 
730  //! Executor of parallel OpenMP loop
731  //!
732  //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
733  //! Specialization for kernels with ompScheduleKind member, compile-time dispatch.
734  //!
735  //! \tparam TKernel The kernel type.
736  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
737  template<typename TKernel, typename TSchedule>
738  struct ParallelFor<TKernel, TSchedule, UseScheduleKind<TKernel, TSchedule>>
739  {
740  //! Run parallel OpenMP loop
741  //!
742  //! \tparam TLoopBody The loop body functor type.
743  //! \tparam TIdx The index type.
744  //!
745  //! \param kernel The kernel instance reference
746  //! \param loopBody The loop body functor instance, takes iteration index as input.
747  //! \param numIterations The number of loop iterations.
748  //! \param schedule The schedule object.
749  template<typename TLoopBody, typename TIdx>
751  TKernel const& kernel,
752  TLoopBody&& loopBody,
753  TIdx const numIterations,
754  TSchedule const& schedule)
755  {
756  // Forward to ParallelForImpl that performs dispatch by by chunk size
758  kernel,
759  std::forward<TLoopBody>(loopBody),
760  numIterations,
761  schedule);
762  }
763  };
764 
765  //! Run parallel OpenMP loop
766  //!
767  //! \tparam TKernel The kernel type.
768  //! \tparam TLoopBody The loop body functor type.
769  //! \tparam TIdx The index type.
770  //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
771  //!
772  //! \param kernel The kernel instance reference,
773  //! not perfect=forwarded to shorten SFINAE internally.
774  //! \param loopBody The loop body functor instance, takes iteration index as input.
775  //! \param numIterations The number of loop iterations.
776  //! \param schedule The schedule object.
777  template<typename TKernel, typename TLoopBody, typename TIdx, typename TSchedule>
779  TKernel const& kernel,
780  TLoopBody&& loopBody,
781  TIdx const numIterations,
782  TSchedule const& schedule)
783  {
784  // Forward to ParallelFor that performs first a dispatch by schedule kind, and then by chunk size
785  ParallelFor<TKernel, TSchedule>{}(kernel, std::forward<TLoopBody>(loopBody), numIterations, schedule);
786  }
787 
788  } // namespace detail
789 
790  //! The CPU OpenMP 2.0 block accelerator execution task.
791  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
792  class TaskKernelCpuOmp2Blocks final : public WorkDivMembers<TDim, TIdx>
793  {
794  public:
795  template<typename TWorkDiv>
796  ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
797  : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
798  , m_kernelFnObj(kernelFnObj)
799  , m_args(std::forward<TArgs>(args)...)
800  {
801  static_assert(
802  Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
803  "The work division and the execution task have to be of the same dimensionality!");
804  }
805 
806  //! Executes the kernel function object.
807  ALPAKA_FN_HOST auto operator()() const -> void
808  {
810 
811  auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
812  auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
813  auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
814 
815  // Get the size of the block shared dynamic memory.
816  auto const blockSharedMemDynSizeBytes = std::apply(
817  [&](std::decay_t<TArgs> const&... args)
818  {
819  return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
820  m_kernelFnObj,
821  blockThreadExtent,
822  threadElemExtent,
823  args...);
824  },
825  m_args);
826 
827 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
828  std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
829  << std::endl;
830 # endif
831 
832  // The number of blocks in the grid.
833  TIdx const numBlocksInGrid(gridBlockExtent.prod());
834 
835  // Get the OpenMP schedule information for the given kernel and parameter types
836  auto const schedule = std::apply(
837  [&](std::decay_t<TArgs> const&... args) {
838  return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
839  m_kernelFnObj,
840  blockThreadExtent,
841  threadElemExtent,
842  args...);
843  },
844  m_args);
845 
846  if(::omp_in_parallel() != 0)
847  {
848 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
849  std::cout << __func__ << " already within a parallel region." << std::endl;
850 # endif
851  parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
852  }
853  else
854  {
855 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
856  std::cout << __func__ << " opening new parallel region." << std::endl;
857 # endif
858 # pragma omp parallel
859  parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
860  }
861  }
862 
863  private:
864  template<typename TSchedule>
865  ALPAKA_FN_HOST auto parallelFn(
866  std::size_t const& blockSharedMemDynSizeBytes,
867  TIdx const& numBlocksInGrid,
868  Vec<TDim, TIdx> const& gridBlockExtent,
869  TSchedule const& schedule) const -> void
870  {
871 # pragma omp single nowait
872  {
873  // The OpenMP runtime does not create a parallel region when either:
874  // * only one thread is required in the num_threads clause
875  // * or only one thread is available
876  // In all other cases we expect to be in a parallel region now.
877  if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
878  {
879  throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
880  }
881 
882 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
883  std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
884 # endif
885  }
886 
887  AccCpuOmp2Blocks<TDim, TIdx> acc(
888  *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
889  blockSharedMemDynSizeBytes);
890 
891  // Body of the OpenMP parallel loop to be executed.
892  // Index type is auto since we have a difference for OpenMP 2.0 and later ones
893  auto loopBody = [&](auto currentIndex)
894  {
895 # if _OPENMP < 200805
896  auto const i_tidx = static_cast<TIdx>(currentIndex); // for issue #840
897  auto const index = Vec<DimInt<1u>, TIdx>(i_tidx); // for issue #840
898 # else
899  auto const index = Vec<DimInt<1u>, TIdx>(currentIndex); // for issue #840
900 # endif
901  acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
902 
903  std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
904 
905  // After a block has been processed, the shared memory has to be deleted.
906  freeSharedVars(acc);
907  };
908 
909  detail::parallelFor(m_kernelFnObj, loopBody, numBlocksInGrid, schedule);
910  }
911 
912  TKernelFnObj m_kernelFnObj;
913  std::tuple<std::decay_t<TArgs>...> m_args;
914  };
915 
916  namespace trait
917  {
918  //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
919  template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
920  struct AccType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
921  {
923  };
924 
        //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DevType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = DevCpu;
        };

        //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct DimType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TDim;
        };

        //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct PlatformType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = PlatformCpu;
        };

        //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
        struct IdxType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
        {
            using type = TIdx;
        };
952 
953  //! \brief Specialisation of the class template FunctionAttributes
954  //! \tparam TDev The device type.
955  //! \tparam TDim The dimensionality of the accelerator device properties.
956  //! \tparam TIdx The idx type of the accelerator device properties.
957  //! \tparam TKernelFn Kernel function object type.
958  //! \tparam TArgs Kernel function object argument types as a parameter pack.
959  template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
960  struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
961  {
962  //! \param dev The device instance
963  //! \param kernelFn The kernel function object which should be executed.
964  //! \param args The kernel invocation arguments.
965  //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
966  //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
968  TDev const& dev,
969  [[maybe_unused]] TKernelFn const& kernelFn,
970  [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
971  {
972  alpaka::KernelFunctionAttributes kernelFunctionAttributes;
973 
974  // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
975  // properties function.
976  auto const& props = alpaka::getAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>(dev);
977  kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
978  kernelFunctionAttributes.maxDynamicSharedSizeBytes
979  = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
980  return kernelFunctionAttributes;
981  }
982  };
983 
984  } // namespace trait
985 } // namespace alpaka
986 
987 # if BOOST_COMP_CLANG
988 # pragma clang diagnostic pop
989 # endif
990 
991 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
The CPU OpenMP 2.0 block accelerator.
The CPU device handle.
Definition: DevCpu.hpp:56
The CPU OpenMP 2.0 block accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
Definition: Common.hpp:95
std::void_t< decltype(TKernel::ompScheduleChunkSize)> HasScheduleChunkSize
Helper type to check if TKernel has member ompScheduleChunkSize.
std::integral_constant< bool, std::is_same< TSchedule, omp::Schedule >::value > IsOmpScheduleTraitSpecialized
Helper type to check if TSchedule is a type originating from OmpSchedule trait definition.
ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
std::enable_if_t< sizeof(TKernel::ompScheduleKind) &&!IsOmpScheduleTraitSpecialized< TSchedule >::value > UseScheduleKind
Helper type to check if member ompScheduleKind of TKernel should be used.
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition: Traits.hpp:54
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
The CPU device platform.
Definition: PlatformCpu.hpp:18
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the given schedule.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless...
Definition: OmpSchedule.hpp:20
int chunkSize
Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a fixed-w...
Definition: OmpSchedule.hpp:44
Kind kind
Schedule kind.
Definition: OmpSchedule.hpp:40
The accelerator type trait.
Definition: Traits.hpp:37
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, [[maybe_unused]] TKernelFn const &kernelFn, [[maybe_unused]] TArgs &&... args) -> alpaka::KernelFunctionAttributes
The structure template to access to the functions attributes of a kernel function object.
Definition: Traits.hpp:79
The idx type trait.
Definition: Traits.hpp:25
The platform type trait.
Definition: Traits.hpp:30