28 #include <type_traits>
30 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
34 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
37 # pragma clang diagnostic push
38 # pragma clang diagnostic ignored "-Wswitch-default"
42 # error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
58 template<
typename TKernel,
typename TSchedule, omp::Schedule::Kind TScheduleKind>
67 template<
typename TKernel,
typename TSchedule>
77 template<
typename TLoopBody,
typename TIdx>
81 TIdx
const numIterations,
86 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
88 # pragma omp for nowait
89 for(i = 0; i < iNumBlocksInGrid; ++i)
91 # pragma omp
for nowait
92 for(TIdx i = 0; i < numIterations; ++i)
96 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
114 template<
typename TKernel>
125 template<
typename TLoopBody,
typename TIdx>
128 TLoopBody&& loopBody,
129 TIdx
const numIterations,
132 # if _OPENMP < 200805
134 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
136 # pragma omp for nowait schedule(static, schedule.chunkSize)
137 for(i = 0; i < iNumBlocksInGrid; ++i)
139 # pragma omp
for nowait schedule(
static, schedule.
chunkSize)
140 for(TIdx i = 0; i < numIterations; ++i)
144 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
156 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
166 template<
typename TLoopBody,
typename TIdx>
169 TLoopBody&& loopBody,
170 TIdx
const numIterations,
173 # if _OPENMP < 200805
175 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
177 # pragma omp for nowait schedule(static)
178 for(i = 0; i < iNumBlocksInGrid; ++i)
180 # pragma omp
for nowait schedule(
static)
181 for(TIdx i = 0; i < numIterations; ++i)
185 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
196 template<
typename TKernel>
205 template<
typename TKernel,
typename TSchedule>
216 template<
typename TLoopBody,
typename TIdx>
218 TKernel
const& kernel,
219 TLoopBody&& loopBody,
220 TIdx
const numIterations,
223 # if _OPENMP < 200805
225 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
227 # pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
228 for(i = 0; i < iNumBlocksInGrid; ++i)
230 # pragma omp
for nowait schedule(
static, kernel.ompScheduleChunkSize)
231 for(TIdx i = 0; i < numIterations; ++i)
235 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
248 template<
typename TKernel,
typename TSchedule>
258 template<
typename TKernel>
269 template<
typename TLoopBody,
typename TIdx>
272 TLoopBody&& loopBody,
273 TIdx
const numIterations,
276 # if _OPENMP < 200805
278 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
280 # pragma omp for nowait schedule(dynamic, schedule.chunkSize)
281 for(i = 0; i < iNumBlocksInGrid; ++i)
283 # pragma omp
for nowait schedule(dynamic, schedule.
chunkSize)
284 for(TIdx i = 0; i < numIterations; ++i)
288 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
300 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
310 template<
typename TLoopBody,
typename TIdx>
313 TLoopBody&& loopBody,
314 TIdx
const numIterations,
317 # if _OPENMP < 200805
319 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
321 # pragma omp for nowait schedule(dynamic)
322 for(i = 0; i < iNumBlocksInGrid; ++i)
324 # pragma omp
for nowait schedule(dynamic)
325 for(TIdx i = 0; i < numIterations; ++i)
329 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
341 template<
typename TKernel,
typename TSchedule>
352 template<
typename TLoopBody,
typename TIdx>
354 TKernel
const& kernel,
355 TLoopBody&& loopBody,
356 TIdx
const numIterations,
359 # if _OPENMP < 200805
361 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
363 # pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
364 for(i = 0; i < iNumBlocksInGrid; ++i)
366 # pragma omp
for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
367 for(TIdx i = 0; i < numIterations; ++i)
371 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
384 template<
typename TKernel,
typename TSchedule>
394 template<
typename TKernel>
405 template<
typename TLoopBody,
typename TIdx>
408 TLoopBody&& loopBody,
409 TIdx
const numIterations,
412 # if _OPENMP < 200805
414 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
416 # pragma omp for nowait schedule(guided, schedule.chunkSize)
417 for(i = 0; i < iNumBlocksInGrid; ++i)
419 # pragma omp
for nowait schedule(guided, schedule.
chunkSize)
420 for(TIdx i = 0; i < numIterations; ++i)
424 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
436 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
446 template<
typename TLoopBody,
typename TIdx>
449 TLoopBody&& loopBody,
450 TIdx
const numIterations,
453 # if _OPENMP < 200805
455 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
457 # pragma omp for nowait schedule(guided)
458 for(i = 0; i < iNumBlocksInGrid; ++i)
460 # pragma omp
for nowait schedule(guided)
461 for(TIdx i = 0; i < numIterations; ++i)
465 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
477 template<
typename TKernel,
typename TSchedule>
488 template<
typename TLoopBody,
typename TIdx>
490 TKernel
const& kernel,
491 TLoopBody&& loopBody,
492 TIdx
const numIterations,
495 # if _OPENMP < 200805
497 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
499 # pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
500 for(i = 0; i < iNumBlocksInGrid; ++i)
502 # pragma omp
for nowait schedule(guided, kernel.ompScheduleChunkSize)
503 for(TIdx i = 0; i < numIterations; ++i)
507 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
520 template<
typename TKernel,
typename TSchedule>
525 # if _OPENMP >= 200805
532 template<
typename TKernel,
typename TSchedule>
542 template<
typename TLoopBody,
typename TIdx>
545 TLoopBody&& loopBody,
546 TIdx
const numIterations,
549 # pragma omp for nowait schedule(auto)
550 for(TIdx i = 0; i < numIterations; ++i)
553 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
566 template<
typename TKernel,
typename TSchedule>
576 template<
typename TLoopBody,
typename TIdx>
579 TLoopBody&& loopBody,
580 TIdx
const numIterations,
583 # if _OPENMP < 200805
585 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
587 # pragma omp for nowait schedule(runtime)
588 for(i = 0; i < iNumBlocksInGrid; ++i)
590 # pragma omp
for nowait schedule(runtime)
591 for(TIdx i = 0; i < numIterations; ++i)
595 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
608 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
620 template<
typename TLoopBody,
typename TIdx>
622 TKernel
const& kernel,
623 TLoopBody&& loopBody,
624 TIdx
const numIterations,
625 TSchedule
const& schedule)
630 std::forward<TLoopBody>(loopBody),
642 template<
typename TKernel>
654 template<
typename TLoopBody,
typename TIdx>
656 TKernel
const& kernel,
657 TLoopBody&& loopBody,
658 TIdx
const numIterations,
662 switch(schedule.
kind)
667 std::forward<TLoopBody>(loopBody),
674 std::forward<TLoopBody>(loopBody),
681 std::forward<TLoopBody>(loopBody),
688 std::forward<TLoopBody>(loopBody),
692 # if _OPENMP >= 200805
696 std::forward<TLoopBody>(loopBody),
704 std::forward<TLoopBody>(loopBody),
715 template<
typename TSchedule>
717 = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
726 template<
typename TKernel,
typename TSchedule>
737 template<
typename TKernel,
typename TSchedule>
749 template<
typename TLoopBody,
typename TIdx>
751 TKernel
const& kernel,
752 TLoopBody&& loopBody,
753 TIdx
const numIterations,
754 TSchedule
const& schedule)
759 std::forward<TLoopBody>(loopBody),
777 template<
typename TKernel,
typename TLoopBody,
typename TIdx,
typename TSchedule>
779 TKernel
const& kernel,
780 TLoopBody&& loopBody,
781 TIdx
const numIterations,
782 TSchedule
const& schedule)
791 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
795 template<
typename TWorkDiv>
798 , m_kernelFnObj(kernelFnObj)
799 , m_args(std::forward<TArgs>(args)...)
802 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
803 "The work division and the execution task have to be of the same dimensionality!");
811 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*
this);
812 auto const blockThreadExtent = getWorkDiv<Block, Threads>(*
this);
813 auto const threadElemExtent = getWorkDiv<Thread, Elems>(*
this);
816 auto const blockSharedMemDynSizeBytes = std::apply(
817 [&](std::decay_t<TArgs>
const&... args)
819 return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
827 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
828 std::cout << __func__ <<
" blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes <<
" B"
833 TIdx
const numBlocksInGrid(gridBlockExtent.prod());
836 auto const schedule = std::apply(
837 [&](std::decay_t<TArgs>
const&... args) {
838 return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
846 if(::omp_in_parallel() != 0)
848 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
849 std::cout << __func__ <<
" already within a parallel region." << std::endl;
851 parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
855 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
856 std::cout << __func__ <<
" opening new parallel region." << std::endl;
858 # pragma omp parallel
859 parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
864 template<
typename TSchedule>
866 std::size_t
const& blockSharedMemDynSizeBytes,
867 TIdx
const& numBlocksInGrid,
869 TSchedule
const& schedule)
const ->
void
871 # pragma omp single nowait
877 if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
879 throw std::runtime_error(
"The OpenMP 2.0 runtime did not create a parallel region!");
882 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
883 std::cout << __func__ <<
" omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
887 AccCpuOmp2Blocks<TDim, TIdx> acc(
888 *
static_cast<WorkDivMembers<TDim, TIdx> const*
>(
this),
889 blockSharedMemDynSizeBytes);
893 auto loopBody = [&](
auto currentIndex)
895 # if _OPENMP < 200805
896 auto const i_tidx =
static_cast<TIdx
>(currentIndex);
897 auto const index = Vec<DimInt<1u>, TIdx>(i_tidx);
899 auto const index = Vec<DimInt<1u>, TIdx>(currentIndex);
901 acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
903 std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
912 TKernelFnObj m_kernelFnObj;
913 std::tuple<std::decay_t<TArgs>...> m_args;
919 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
926 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
933 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
940 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
947 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
959 template<
typename TDev,
typename TDim,
typename TIdx,
typename TKernelFn,
typename... TArgs>
969 [[maybe_unused]] TKernelFn
const& kernelFn,
976 auto const& props = alpaka::getAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>(dev);
977 kernelFunctionAttributes.
maxThreadsPerBlock =
static_cast<int>(props.m_blockThreadCountMax);
980 return kernelFunctionAttributes;
987 # if BOOST_COMP_CLANG
988 # pragma clang diagnostic pop
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
The CPU OpenMP 2.0 block accelerator.
The CPU OpenMP 2.0 block accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A basic class holding the work division as grid block extent, block thread extent, and thread element extent.
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
std::void_t< decltype(TKernel::ompScheduleChunkSize)> HasScheduleChunkSize
Helper type to check if TKernel has member ompScheduleChunkSize.
std::integral_constant< bool, std::is_same< TSchedule, omp::Schedule >::value > IsOmpScheduleTraitSpecialized
Helper type to check whether TSchedule is a type originating from the OmpSchedule trait definition.
ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
std::enable_if_t< sizeof(TKernel::ompScheduleKind) &&!IsOmpScheduleTraitSpecialized< TSchedule >::value > UseScheduleKind
Helper type to check if member ompScheduleKind of TKernel should be used.
The alpaka accelerator library.
constexpr std::uint32_t BlockSharedDynMemberAllocKiB
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using ...
int maxDynamicSharedSizeBytes
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with no schedule set.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the given schedule.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless...
int chunkSize
Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a fixed-w...
The accelerator type trait.
The dimension getter type trait.
static ALPAKA_FN_HOST auto getFunctionAttributes(TDev const &dev, [[maybe_unused]] TKernelFn const &kernelFn, [[maybe_unused]] TArgs &&... args) -> alpaka::KernelFunctionAttributes
The structure template to access the function attributes of a kernel function object.