alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
TaskKernelCpuOmp2Blocks.hpp
Go to the documentation of this file.
/* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

// Specialized traits.
#include "alpaka/acc/Traits.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/dim/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/platform/Traits.hpp"

// Implementation details.
#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
#include "alpaka/core/Decay.hpp"
#include "alpaka/core/OmpSchedule.hpp"
#include "alpaka/dev/DevCpu.hpp"
#include "alpaka/idx/MapIdx.hpp"
#include "alpaka/kernel/KernelFunctionAttributes.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/vec/Vec.hpp"
#include "alpaka/workdiv/WorkDivMembers.hpp"

#include <cstdint>
#include <functional>
#include <stdexcept>
#include <tuple>
#include <type_traits>
#include <utility>
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
#    include <iostream>
#endif
33
34#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
35
36# if BOOST_COMP_CLANG
37# pragma clang diagnostic push
38# pragma clang diagnostic ignored "-Wswitch-default"
39# endif
40
41# if _OPENMP < 200203
42# error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
43# endif
44
45# include <omp.h>
46
47namespace alpaka
48{
49 namespace detail
50 {
51 //! Executor of parallel OpenMP loop with the given schedule
52 //!
53 //! Is explicitly specialized for all supported schedule kinds to help code optimization by compilers.
54 //!
55 //! \tparam TKernel The kernel type.
56 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
57 //! \tparam TScheduleKind The schedule kind value.
58 template<typename TKernel, typename TSchedule, omp::Schedule::Kind TScheduleKind>
60
61 //! Executor of parallel OpenMP loop with no schedule set
62 //!
63 //! Does not use chunk size.
64 //!
65 //! \tparam TKernel The kernel type.
66 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
67 template<typename TKernel, typename TSchedule>
68 struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>
69 {
70 //! Run parallel OpenMP loop
71 //!
72 //! \tparam TLoopBody The loop body functor type.
73 //! \tparam TIdx The index type.
74 //!
75 //! \param loopBody The loop body functor instance, takes iteration index as input.
76 //! \param numIterations The number of loop iterations.
77 template<typename TLoopBody, typename TIdx>
79 TKernel const&,
80 TLoopBody&& loopBody,
81 TIdx const numIterations,
82 TSchedule const&)
83 {
84# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
85 // header.
86 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
87 std::intmax_t i;
88# pragma omp for nowait
89 for(i = 0; i < iNumBlocksInGrid; ++i)
90# else
91# pragma omp for nowait
92 for(TIdx i = 0; i < numIterations; ++i)
93# endif
94 {
95 // Make another lambda to work around #1288
96 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
97 wrappedLoopBody(i);
98 }
99 }
100 };
101
        /* Implementations for Static, Dynamic and Guided follow the same pattern.
         * There are two specializations of ParallelForImpl for compile-time dispatch depending on whether the
         * OmpSchedule trait is specialized.
         * The no trait case is further compile-time dispatched with a helper ParallelForStaticImpl.
         * It is based on whether ompScheduleChunkSize member is available.
         */
108
109 //! Executor of parallel OpenMP loop with the static schedule
110 //!
111 //! Specialization for kernels specializing the OmpSchedule trait.
112 //!
113 //! \tparam TKernel The kernel type.
114 template<typename TKernel>
115 struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>
116 {
117 //! Run parallel OpenMP loop
118 //!
119 //! \tparam TLoopBody The loop body functor type.
120 //! \tparam TIdx The index type.
121 //!
122 //! \param loopBody The loop body functor instance, takes iteration index as input.
123 //! \param numIterations The number of loop iterations.
124 //! \param schedule The schedule object.
125 template<typename TLoopBody, typename TIdx>
127 TKernel const&,
128 TLoopBody&& loopBody,
129 TIdx const numIterations,
130 omp::Schedule const& schedule)
131 {
132# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
133 // header.
134 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
135 std::intmax_t i;
136# pragma omp for nowait schedule(static, schedule.chunkSize)
137 for(i = 0; i < iNumBlocksInGrid; ++i)
138# else
139# pragma omp for nowait schedule(static, schedule.chunkSize)
140 for(TIdx i = 0; i < numIterations; ++i)
141# endif
142 {
143 // Make another lambda to work around #1288
144 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
145 wrappedLoopBody(i);
146 }
147 }
148 };
149
150 //! Helper executor of parallel OpenMP loop with the static schedule
151 //!
152 //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
153 //!
154 //! \tparam TKernel The kernel type.
155 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
156 template<typename TKernel, typename TSchedule, typename TSfinae = void>
158 {
159 //! Run parallel OpenMP loop
160 //!
161 //! \tparam TLoopBody The loop body functor type.
162 //! \tparam TIdx The index type.
163 //!
164 //! \param loopBody The loop body functor instance, takes iteration index as input.
165 //! \param numIterations The number of loop iterations.
166 template<typename TLoopBody, typename TIdx>
168 TKernel const&,
169 TLoopBody&& loopBody,
170 TIdx const numIterations,
171 TSchedule const&)
172 {
173# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
174 // header.
175 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
176 std::intmax_t i;
177# pragma omp for nowait schedule(static)
178 for(i = 0; i < iNumBlocksInGrid; ++i)
179# else
180# pragma omp for nowait schedule(static)
181 for(TIdx i = 0; i < numIterations; ++i)
182# endif
183 {
184 // Make another lambda to work around #1288
185 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
186 wrappedLoopBody(i);
187 }
188 }
189 };
190
191 //! Helper type to check if TKernel has member ompScheduleChunkSize
192 //!
193 //! Is void for those types, ill-formed otherwise.
194 //!
195 //! \tparam TKernel The kernel type.
196 template<typename TKernel>
197 using HasScheduleChunkSize = std::void_t<decltype(TKernel::ompScheduleChunkSize)>;
198
199 //! Helper executor of parallel OpenMP loop with the static schedule
200 //!
201 //! Specialization for kernels with ompScheduleChunkSize member.
202 //!
203 //! \tparam TKernel The kernel type.
204 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
205 template<typename TKernel, typename TSchedule>
206 struct ParallelForStaticImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
207 {
208 //! Run parallel OpenMP loop
209 //!
210 //! \tparam TLoopBody The loop body functor type.
211 //! \tparam TIdx The index type.
212 //!
213 //! \param kernel The kernel instance reference
214 //! \param loopBody The loop body functor instance, takes iteration index as input.
215 //! \param numIterations The number of loop iterations.
216 template<typename TLoopBody, typename TIdx>
218 TKernel const& kernel,
219 TLoopBody&& loopBody,
220 TIdx const numIterations,
221 TSchedule const&)
222 {
223# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
224 // header.
225 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
226 std::intmax_t i;
227# pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
228 for(i = 0; i < iNumBlocksInGrid; ++i)
229# else
230# pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
231 for(TIdx i = 0; i < numIterations; ++i)
232# endif
233 {
234 // Make another lambda to work around #1288
235 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
236 wrappedLoopBody(i);
237 }
238 }
239 };
240
241 //! Executor of parallel OpenMP loop with the static schedule
242 //!
243 //! Specialization for kernels not specializing the OmpSchedule trait.
244 //! Falls back to ParallelForStaticImpl for further dispatch.
245 //!
246 //! \tparam TKernel The kernel type.
247 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
248 template<typename TKernel, typename TSchedule>
249 struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Static> : ParallelForStaticImpl<TKernel, TSchedule>
250 {
251 };
252
253 //! Executor of parallel OpenMP loop with the dynamic schedule
254 //!
255 //! Specialization for kernels specializing the OmpSchedule trait.
256 //!
257 //! \tparam TKernel The kernel type.
258 template<typename TKernel>
259 struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>
260 {
261 //! Run parallel OpenMP loop
262 //!
263 //! \tparam TLoopBody The loop body functor type.
264 //! \tparam TIdx The index type.
265 //!
266 //! \param loopBody The loop body functor instance, takes iteration index as input.
267 //! \param numIterations The number of loop iterations.
268 //! \param schedule The schedule object.
269 template<typename TLoopBody, typename TIdx>
271 TKernel const&,
272 TLoopBody&& loopBody,
273 TIdx const numIterations,
274 omp::Schedule const& schedule)
275 {
276# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
277 // header.
278 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
279 std::intmax_t i;
280# pragma omp for nowait schedule(dynamic, schedule.chunkSize)
281 for(i = 0; i < iNumBlocksInGrid; ++i)
282# else
283# pragma omp for nowait schedule(dynamic, schedule.chunkSize)
284 for(TIdx i = 0; i < numIterations; ++i)
285# endif
286 {
287 // Make another lambda to work around #1288
288 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
289 wrappedLoopBody(i);
290 }
291 }
292 };
293
294 //! Helper executor of parallel OpenMP loop with the dynamic schedule
295 //!
296 //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
297 //!
298 //! \tparam TKernel The kernel type.
299 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
300 template<typename TKernel, typename TSchedule, typename TSfinae = void>
302 {
303 //! Run parallel OpenMP loop
304 //!
305 //! \tparam TLoopBody The loop body functor type.
306 //! \tparam TIdx The index type.
307 //!
308 //! \param loopBody The loop body functor instance, takes iteration index as input.
309 //! \param numIterations The number of loop iterations.
310 template<typename TLoopBody, typename TIdx>
312 TKernel const&,
313 TLoopBody&& loopBody,
314 TIdx const numIterations,
315 TSchedule const&)
316 {
317# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
318 // header.
319 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
320 std::intmax_t i;
321# pragma omp for nowait schedule(dynamic)
322 for(i = 0; i < iNumBlocksInGrid; ++i)
323# else
324# pragma omp for nowait schedule(dynamic)
325 for(TIdx i = 0; i < numIterations; ++i)
326# endif
327 {
328 // Make another lambda to work around #1288
329 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
330 wrappedLoopBody(i);
331 }
332 }
333 };
334
335 //! Helper executor of parallel OpenMP loop with the dynamic schedule
336 //!
337 //! Specialization for kernels with ompScheduleChunkSize member.
338 //!
339 //! \tparam TKernel The kernel type.
340 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
341 template<typename TKernel, typename TSchedule>
342 struct ParallelForDynamicImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
343 {
344 //! Run parallel OpenMP loop
345 //!
346 //! \tparam TLoopBody The loop body functor type.
347 //! \tparam TIdx The index type.
348 //!
349 //! \param kernel The kernel instance reference
350 //! \param loopBody The loop body functor instance, takes iteration index as input.
351 //! \param numIterations The number of loop iterations.
352 template<typename TLoopBody, typename TIdx>
354 TKernel const& kernel,
355 TLoopBody&& loopBody,
356 TIdx const numIterations,
357 TSchedule const&)
358 {
359# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
360 // header.
361 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
362 std::intmax_t i;
363# pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
364 for(i = 0; i < iNumBlocksInGrid; ++i)
365# else
366# pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
367 for(TIdx i = 0; i < numIterations; ++i)
368# endif
369 {
370 // Make another lambda to work around #1288
371 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
372 wrappedLoopBody(i);
373 }
374 }
375 };
376
377 //! Executor of parallel OpenMP loop with the dynamic schedule
378 //!
379 //! Specialization for kernels not specializing the OmpSchedule trait.
380 //! Falls back to ParallelForDynamicImpl for further dispatch.
381 //!
382 //! \tparam TKernel The kernel type.
383 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
384 template<typename TKernel, typename TSchedule>
385 struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Dynamic> : ParallelForDynamicImpl<TKernel, TSchedule>
386 {
387 };
388
389 //! Executor of parallel OpenMP loop with the guided schedule
390 //!
391 //! Specialization for kernels specializing the OmpSchedule trait.
392 //!
393 //! \tparam TKernel The kernel type.
394 template<typename TKernel>
395 struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>
396 {
397 //! Run parallel OpenMP loop
398 //!
399 //! \tparam TLoopBody The loop body functor type.
400 //! \tparam TIdx The index type.
401 //!
402 //! \param loopBody The loop body functor instance, takes iteration index as input.
403 //! \param numIterations The number of loop iterations.
404 //! \param schedule The schedule object.
405 template<typename TLoopBody, typename TIdx>
407 TKernel const&,
408 TLoopBody&& loopBody,
409 TIdx const numIterations,
410 omp::Schedule const& schedule)
411 {
412# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
413 // header.
414 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
415 std::intmax_t i;
416# pragma omp for nowait schedule(guided, schedule.chunkSize)
417 for(i = 0; i < iNumBlocksInGrid; ++i)
418# else
419# pragma omp for nowait schedule(guided, schedule.chunkSize)
420 for(TIdx i = 0; i < numIterations; ++i)
421# endif
422 {
423 // Make another lambda to work around #1288
424 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
425 wrappedLoopBody(i);
426 }
427 }
428 };
429
430 //! Helper executor of parallel OpenMP loop with the guided schedule
431 //!
432 //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
433 //!
434 //! \tparam TKernel The kernel type.
435 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
436 template<typename TKernel, typename TSchedule, typename TSfinae = void>
438 {
439 //! Run parallel OpenMP loop
440 //!
441 //! \tparam TLoopBody The loop body functor type.
442 //! \tparam TIdx The index type.
443 //!
444 //! \param loopBody The loop body functor instance, takes iteration index as input.
445 //! \param numIterations The number of loop iterations.
446 template<typename TLoopBody, typename TIdx>
448 TKernel const&,
449 TLoopBody&& loopBody,
450 TIdx const numIterations,
451 TSchedule const&)
452 {
453# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
454 // header.
455 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
456 std::intmax_t i;
457# pragma omp for nowait schedule(guided)
458 for(i = 0; i < iNumBlocksInGrid; ++i)
459# else
460# pragma omp for nowait schedule(guided)
461 for(TIdx i = 0; i < numIterations; ++i)
462# endif
463 {
464 // Make another lambda to work around #1288
465 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
466 wrappedLoopBody(i);
467 }
468 }
469 };
470
471 //! Helper executor of parallel OpenMP loop with the guided schedule
472 //!
473 //! Specialization for kernels with ompScheduleChunkSize member.
474 //!
475 //! \tparam TKernel The kernel type.
476 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
477 template<typename TKernel, typename TSchedule>
478 struct ParallelForGuidedImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
479 {
480 //! Run parallel OpenMP loop
481 //!
482 //! \tparam TLoopBody The loop body functor type.
483 //! \tparam TIdx The index type.
484 //!
485 //! \param kernel The kernel instance reference
486 //! \param loopBody The loop body functor instance, takes iteration index as input.
487 //! \param numIterations The number of loop iterations.
488 template<typename TLoopBody, typename TIdx>
490 TKernel const& kernel,
491 TLoopBody&& loopBody,
492 TIdx const numIterations,
493 TSchedule const&)
494 {
495# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
496 // header.
497 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
498 std::intmax_t i;
499# pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
500 for(i = 0; i < iNumBlocksInGrid; ++i)
501# else
502# pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
503 for(TIdx i = 0; i < numIterations; ++i)
504# endif
505 {
506 // Make another lambda to work around #1288
507 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
508 wrappedLoopBody(i);
509 }
510 }
511 };
512
513 //! Executor of parallel OpenMP loop with the guided schedule
514 //!
515 //! Specialization for kernels not specializing the OmpSchedule trait.
516 //! Falls back to ParallelForGuidedImpl for further dispatch.
517 //!
518 //! \tparam TKernel The kernel type.
519 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
520 template<typename TKernel, typename TSchedule>
521 struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Guided> : ParallelForGuidedImpl<TKernel, TSchedule>
522 {
523 };
524
#    if _OPENMP >= 200805 // schedule(auto) requires OpenMP >= 3.0
        //! Executor of parallel OpenMP loop with auto schedule set
        //!
        //! Does not use chunk size.
        //!
        //! \tparam TKernel The kernel type.
        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
        template<typename TKernel, typename TSchedule>
        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Auto>
        {
            //! Run parallel OpenMP loop
            //!
            //! \tparam TLoopBody The loop body functor type.
            //! \tparam TIdx The index type.
            //!
            //! \param loopBody The loop body functor instance, takes iteration index as input.
            //! \param numIterations The number of loop iterations.
            template<typename TLoopBody, typename TIdx>
            ALPAKA_FN_HOST void operator()(
                TKernel const&,
                TLoopBody&& loopBody,
                TIdx const numIterations,
                TSchedule const&)
            {
#        pragma omp for nowait schedule(auto)
                for(TIdx i = 0; i < numIterations; ++i)
                {
                    // Make another lambda to work around #1288
                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
                    wrappedLoopBody(i);
                }
            }
        };
#    endif
559
560 //! Executor of parallel OpenMP loop with runtime schedule set
561 //!
562 //! Does not use chunk size.
563 //!
564 //! \tparam TKernel The kernel type.
565 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
566 template<typename TKernel, typename TSchedule>
567 struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Runtime>
568 {
569 //! Run parallel OpenMP loop
570 //!
571 //! \tparam TLoopBody The loop body functor type.
572 //! \tparam TIdx The index type.
573 //!
574 //! \param loopBody The loop body functor instance, takes iteration index as input.
575 //! \param numIterations The number of loop iterations.
576 template<typename TLoopBody, typename TIdx>
578 TKernel const&,
579 TLoopBody&& loopBody,
580 TIdx const numIterations,
581 TSchedule const&)
582 {
583# if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
584 // header.
585 std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
586 std::intmax_t i;
587# pragma omp for nowait schedule(runtime)
588 for(i = 0; i < iNumBlocksInGrid; ++i)
589# else
590# pragma omp for nowait schedule(runtime)
591 for(TIdx i = 0; i < numIterations; ++i)
592# endif
593 {
594 // Make another lambda to work around #1288
595 auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
596 wrappedLoopBody(i);
597 }
598 }
599 };
600
601 //! Executor of parallel OpenMP loop
602 //!
603 //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
604 //! The default implementation is for the kernels that do not set schedule in any way, compile-time dispatch.
605 //!
606 //! \tparam TKernel The kernel type.
607 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
608 template<typename TKernel, typename TSchedule, typename TSfinae = void>
610 {
611 //! Run parallel OpenMP loop
612 //!
613 //! \tparam TLoopBody The loop body functor type.
614 //! \tparam TIdx The index type.
615 //!
616 //! \param kernel The kernel instance reference
617 //! \param loopBody The loop body functor instance, takes iteration index as input.
618 //! \param numIterations The number of loop iterations.
619 //! \param schedule The schedule object.
620 template<typename TLoopBody, typename TIdx>
622 TKernel const& kernel,
623 TLoopBody&& loopBody,
624 TIdx const numIterations,
625 TSchedule const& schedule)
626 {
627 // Forward to ParallelForImpl that performs dispatch by by chunk size
629 kernel,
630 std::forward<TLoopBody>(loopBody),
631 numIterations,
632 schedule);
633 }
634 };
635
636 //! Executor of parallel OpenMP loop
637 //!
638 //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
639 //! Specialization for kernels specializing the OmpSchedule trait, run-time dispatch.
640 //!
641 //! \tparam TKernel The kernel type.
642 template<typename TKernel>
643 struct ParallelFor<TKernel, omp::Schedule>
644 {
645 //! Run parallel OpenMP loop
646 //!
647 //! \tparam TLoopBody The loop body functor type.
648 //! \tparam TIdx The index type.
649 //!
650 //! \param kernel The kernel instance reference
651 //! \param loopBody The loop body functor instance, takes iteration index as input.
652 //! \param numIterations The number of loop iterations.
653 //! \param schedule The schedule object.
654 template<typename TLoopBody, typename TIdx>
656 TKernel const& kernel,
657 TLoopBody&& loopBody,
658 TIdx const numIterations,
659 omp::Schedule const& schedule)
660 {
661 // Forward to ParallelForImpl that performs dispatch by by chunk size
662 switch(schedule.kind)
663 {
666 kernel,
667 std::forward<TLoopBody>(loopBody),
668 numIterations,
669 schedule);
670 break;
673 kernel,
674 std::forward<TLoopBody>(loopBody),
675 numIterations,
676 schedule);
677 break;
680 kernel,
681 std::forward<TLoopBody>(loopBody),
682 numIterations,
683 schedule);
684 break;
687 kernel,
688 std::forward<TLoopBody>(loopBody),
689 numIterations,
690 schedule);
691 break;
692# if _OPENMP >= 200805
695 kernel,
696 std::forward<TLoopBody>(loopBody),
697 numIterations,
698 schedule);
699 break;
700# endif
703 kernel,
704 std::forward<TLoopBody>(loopBody),
705 numIterations,
706 schedule);
707 break;
708 }
709 }
710 };
711
712 //! Helper type to check if TSchedule is a type originating from OmpSchedule trait definition
713 //!
714 //! \tparam TSchedule The schedule type.
715 template<typename TSchedule>
717 = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
718
719 //! Helper type to check if member ompScheduleKind of TKernel should be used
720 //!
721 //! For that it has to be present, and no OmpSchedule trait specialized.
722 //! Is void for those types, ill-formed otherwise.
723 //!
724 //! \tparam TKernel The kernel type.
725 //! \tparam TSchedule The schedule type.
726 template<typename TKernel, typename TSchedule>
728 = std::enable_if_t<sizeof(TKernel::ompScheduleKind) && !IsOmpScheduleTraitSpecialized<TSchedule>::value>;
729
730 //! Executor of parallel OpenMP loop
731 //!
732 //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
733 //! Specialization for kernels with ompScheduleKind member, compile-time dispatch.
734 //!
735 //! \tparam TKernel The kernel type.
736 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
737 template<typename TKernel, typename TSchedule>
738 struct ParallelFor<TKernel, TSchedule, UseScheduleKind<TKernel, TSchedule>>
739 {
740 //! Run parallel OpenMP loop
741 //!
742 //! \tparam TLoopBody The loop body functor type.
743 //! \tparam TIdx The index type.
744 //!
745 //! \param kernel The kernel instance reference
746 //! \param loopBody The loop body functor instance, takes iteration index as input.
747 //! \param numIterations The number of loop iterations.
748 //! \param schedule The schedule object.
749 template<typename TLoopBody, typename TIdx>
751 TKernel const& kernel,
752 TLoopBody&& loopBody,
753 TIdx const numIterations,
754 TSchedule const& schedule)
755 {
756 // Forward to ParallelForImpl that performs dispatch by by chunk size
758 kernel,
759 std::forward<TLoopBody>(loopBody),
760 numIterations,
761 schedule);
762 }
763 };
764
765 //! Run parallel OpenMP loop
766 //!
767 //! \tparam TKernel The kernel type.
768 //! \tparam TLoopBody The loop body functor type.
769 //! \tparam TIdx The index type.
770 //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
771 //!
772 //! \param kernel The kernel instance reference,
773 //! not perfect=forwarded to shorten SFINAE internally.
774 //! \param loopBody The loop body functor instance, takes iteration index as input.
775 //! \param numIterations The number of loop iterations.
776 //! \param schedule The schedule object.
777 template<typename TKernel, typename TLoopBody, typename TIdx, typename TSchedule>
779 TKernel const& kernel,
780 TLoopBody&& loopBody,
781 TIdx const numIterations,
782 TSchedule const& schedule)
783 {
784 // Forward to ParallelFor that performs first a dispatch by schedule kind, and then by chunk size
785 ParallelFor<TKernel, TSchedule>{}(kernel, std::forward<TLoopBody>(loopBody), numIterations, schedule);
786 }
787
788 } // namespace detail
789
790 //! The CPU OpenMP 2.0 block accelerator execution task.
791 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
792 class TaskKernelCpuOmp2Blocks final : public WorkDivMembers<TDim, TIdx>
793 {
794 public:
795 template<typename TWorkDiv>
796 ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
797 : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
798 , m_kernelFnObj(kernelFnObj)
799 , m_args(std::forward<TArgs>(args)...)
800 {
801 static_assert(
802 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
803 "The work division and the execution task have to be of the same dimensionality!");
804 }
805
806 //! Executes the kernel function object.
807 ALPAKA_FN_HOST auto operator()() const -> void
808 {
810
811 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
812 auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
813 auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
814
815 // Get the size of the block shared dynamic memory.
816 auto const blockSharedMemDynSizeBytes = std::apply(
817 [&](std::decay_t<TArgs> const&... args)
818 {
819 return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
820 m_kernelFnObj,
821 blockThreadExtent,
822 threadElemExtent,
823 args...);
824 },
825 m_args);
826
827# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
828 std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
829 << std::endl;
830# endif
831
832 // The number of blocks in the grid.
833 TIdx const numBlocksInGrid(gridBlockExtent.prod());
834
835 // Get the OpenMP schedule information for the given kernel and parameter types
836 auto const schedule = std::apply(
837 [&](std::decay_t<TArgs> const&... args) {
838 return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
839 m_kernelFnObj,
840 blockThreadExtent,
841 threadElemExtent,
842 args...);
843 },
844 m_args);
845
846 if(::omp_in_parallel() != 0)
847 {
848# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
849 std::cout << __func__ << " already within a parallel region." << std::endl;
850# endif
851 parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
852 }
853 else
854 {
855# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
856 std::cout << __func__ << " opening new parallel region." << std::endl;
857# endif
858# pragma omp parallel
859 parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
860 }
861 }
862
863 private:
864 template<typename TSchedule>
865 ALPAKA_FN_HOST auto parallelFn(
866 std::size_t const& blockSharedMemDynSizeBytes,
867 TIdx const& numBlocksInGrid,
868 Vec<TDim, TIdx> const& gridBlockExtent,
869 TSchedule const& schedule) const -> void
870 {
871# pragma omp single nowait
872 {
873 // The OpenMP runtime does not create a parallel region when either:
874 // * only one thread is required in the num_threads clause
875 // * or only one thread is available
876 // In all other cases we expect to be in a parallel region now.
877 if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
878 {
879 throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
880 }
881
882# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
883 std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
884# endif
885 }
886
887 AccCpuOmp2Blocks<TDim, TIdx> acc(
888 *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
889 blockSharedMemDynSizeBytes);
890
891 // Body of the OpenMP parallel loop to be executed.
892 // Index type is auto since we have a difference for OpenMP 2.0 and later ones
893 auto loopBody = [&](auto currentIndex)
894 {
895# if _OPENMP < 200805
896 auto const i_tidx = static_cast<TIdx>(currentIndex); // for issue #840
897 auto const index = Vec<DimInt<1u>, TIdx>(i_tidx); // for issue #840
898# else
899 auto const index = Vec<DimInt<1u>, TIdx>(currentIndex); // for issue #840
900# endif
901 acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
902
903 std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
904
905 // After a block has been processed, the shared memory has to be deleted.
906 freeSharedVars(acc);
907 };
908
909 detail::parallelFor(m_kernelFnObj, loopBody, numBlocksInGrid, schedule);
910 }
911
912 TKernelFnObj m_kernelFnObj;
913 std::tuple<std::decay_t<TArgs>...> m_args;
914 };
915
916 namespace trait
917 {
918 //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
919 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
920 struct AccType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
921 {
922 using type = AccCpuOmp2Blocks<TDim, TIdx>;
923 };
924
925 //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
926 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
927 struct DevType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
928 {
929 using type = DevCpu;
930 };
931
932 //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
933 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
934 struct DimType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
935 {
936 using type = TDim;
937 };
938
939 //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
940 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
941 struct PlatformType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
942 {
943 using type = PlatformCpu;
944 };
945
946 //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
947 template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
948 struct IdxType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
949 {
950 using type = TIdx;
951 };
952
953 //! \brief Specialisation of the class template FunctionAttributes
954 //! \tparam TDev The device type.
955 //! \tparam TDim The dimensionality of the accelerator device properties.
956 //! \tparam TIdx The idx type of the accelerator device properties.
957 //! \tparam TKernelFn Kernel function object type.
958 //! \tparam TArgs Kernel function object argument types as a parameter pack.
959 template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
960 struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
961 {
962 //! \param dev The device instance
963 //! \param kernelFn The kernel function object which should be executed.
964 //! \param args The kernel invocation arguments.
965 //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
966 //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
968 TDev const& dev,
969 [[maybe_unused]] TKernelFn const& kernelFn,
970 [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
971 {
972 alpaka::KernelFunctionAttributes kernelFunctionAttributes;
973
974 // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
975 // properties function.
976 auto const& props = alpaka::getAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>(dev);
977 kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
978 kernelFunctionAttributes.maxDynamicSharedSizeBytes
979 = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
980 return kernelFunctionAttributes;
981 }
982 };
983
984 } // namespace trait
985} // namespace alpaka
986
987# if BOOST_COMP_CLANG
988# pragma clang diagnostic pop
989# endif
990
991#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
The CPU OpenMP 2.0 block accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A n-dimensional vector.
Definition Vec.hpp:38
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
Definition Common.hpp:95
std::void_t< decltype(TKernel::ompScheduleChunkSize)> HasScheduleChunkSize
Helper type to check if TKernel has member ompScheduleChunkSize.
std::integral_constant< bool, std::is_same< TSchedule, omp::Schedule >::value > IsOmpScheduleTraitSpecialized
Helper type to check if TSchedule is a type originating from OmpSchedule trait definition.
ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
std::enable_if_t< sizeof(TKernel::ompScheduleKind) &&!IsOmpScheduleTraitSpecialized< TSchedule >::value > UseScheduleKind
Helper type to check if member ompScheduleKind of TKernel should be used.
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Definition Traits.hpp:54
STL namespace.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the given schedule.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless...
int chunkSize
Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a fixed-w...
Kind kind
Schedule kind.
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, TKernelFnObj const &kernelFn, TArgs &&... args) -> alpaka::KernelFunctionAttributes
Definition Traits.hpp:85