26 #include <type_traits>
28 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
32 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
35 # error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
51 template<
typename TKernel,
typename TSchedule, omp::Schedule::Kind TScheduleKind>
60 template<
typename TKernel,
typename TSchedule>
70 template<
typename TLoopBody,
typename TIdx>
74 TIdx
const numIterations,
79 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
81 # pragma omp for nowait
82 for(i = 0; i < iNumBlocksInGrid; ++i)
84 # pragma omp
for nowait
85 for(TIdx i = 0; i < numIterations; ++i)
89 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
107 template<
typename TKernel>
118 template<
typename TLoopBody,
typename TIdx>
121 TLoopBody&& loopBody,
122 TIdx
const numIterations,
125 # if _OPENMP < 200805
127 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
129 # pragma omp for nowait schedule(static, schedule.chunkSize)
130 for(i = 0; i < iNumBlocksInGrid; ++i)
132 # pragma omp
for nowait schedule(
static, schedule.
chunkSize)
133 for(TIdx i = 0; i < numIterations; ++i)
137 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
149 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
159 template<
typename TLoopBody,
typename TIdx>
162 TLoopBody&& loopBody,
163 TIdx
const numIterations,
166 # if _OPENMP < 200805
168 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
170 # pragma omp for nowait schedule(static)
171 for(i = 0; i < iNumBlocksInGrid; ++i)
173 # pragma omp
for nowait schedule(
static)
174 for(TIdx i = 0; i < numIterations; ++i)
178 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
189 template<
typename TKernel>
198 template<
typename TKernel,
typename TSchedule>
209 template<
typename TLoopBody,
typename TIdx>
211 TKernel
const& kernel,
212 TLoopBody&& loopBody,
213 TIdx
const numIterations,
216 # if _OPENMP < 200805
218 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
220 # pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
221 for(i = 0; i < iNumBlocksInGrid; ++i)
223 # pragma omp
for nowait schedule(
static, kernel.ompScheduleChunkSize)
224 for(TIdx i = 0; i < numIterations; ++i)
228 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
241 template<
typename TKernel,
typename TSchedule>
251 template<
typename TKernel>
262 template<
typename TLoopBody,
typename TIdx>
265 TLoopBody&& loopBody,
266 TIdx
const numIterations,
269 # if _OPENMP < 200805
271 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
273 # pragma omp for nowait schedule(dynamic, schedule.chunkSize)
274 for(i = 0; i < iNumBlocksInGrid; ++i)
276 # pragma omp
for nowait schedule(dynamic, schedule.
chunkSize)
277 for(TIdx i = 0; i < numIterations; ++i)
281 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
293 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
303 template<
typename TLoopBody,
typename TIdx>
306 TLoopBody&& loopBody,
307 TIdx
const numIterations,
310 # if _OPENMP < 200805
312 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
314 # pragma omp for nowait schedule(dynamic)
315 for(i = 0; i < iNumBlocksInGrid; ++i)
317 # pragma omp
for nowait schedule(dynamic)
318 for(TIdx i = 0; i < numIterations; ++i)
322 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
334 template<
typename TKernel,
typename TSchedule>
345 template<
typename TLoopBody,
typename TIdx>
347 TKernel
const& kernel,
348 TLoopBody&& loopBody,
349 TIdx
const numIterations,
352 # if _OPENMP < 200805
354 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
356 # pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
357 for(i = 0; i < iNumBlocksInGrid; ++i)
359 # pragma omp
for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
360 for(TIdx i = 0; i < numIterations; ++i)
364 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
377 template<
typename TKernel,
typename TSchedule>
387 template<
typename TKernel>
398 template<
typename TLoopBody,
typename TIdx>
401 TLoopBody&& loopBody,
402 TIdx
const numIterations,
405 # if _OPENMP < 200805
407 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
409 # pragma omp for nowait schedule(guided, schedule.chunkSize)
410 for(i = 0; i < iNumBlocksInGrid; ++i)
412 # pragma omp
for nowait schedule(guided, schedule.
chunkSize)
413 for(TIdx i = 0; i < numIterations; ++i)
417 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
429 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
439 template<
typename TLoopBody,
typename TIdx>
442 TLoopBody&& loopBody,
443 TIdx
const numIterations,
446 # if _OPENMP < 200805
448 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
450 # pragma omp for nowait schedule(guided)
451 for(i = 0; i < iNumBlocksInGrid; ++i)
453 # pragma omp
for nowait schedule(guided)
454 for(TIdx i = 0; i < numIterations; ++i)
458 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
470 template<
typename TKernel,
typename TSchedule>
481 template<
typename TLoopBody,
typename TIdx>
483 TKernel
const& kernel,
484 TLoopBody&& loopBody,
485 TIdx
const numIterations,
488 # if _OPENMP < 200805
490 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
492 # pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
493 for(i = 0; i < iNumBlocksInGrid; ++i)
495 # pragma omp
for nowait schedule(guided, kernel.ompScheduleChunkSize)
496 for(TIdx i = 0; i < numIterations; ++i)
500 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
513 template<
typename TKernel,
typename TSchedule>
518 # if _OPENMP >= 200805
525 template<
typename TKernel,
typename TSchedule>
535 template<
typename TLoopBody,
typename TIdx>
538 TLoopBody&& loopBody,
539 TIdx
const numIterations,
542 # pragma omp for nowait schedule(auto)
543 for(TIdx i = 0; i < numIterations; ++i)
546 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
559 template<
typename TKernel,
typename TSchedule>
569 template<
typename TLoopBody,
typename TIdx>
572 TLoopBody&& loopBody,
573 TIdx
const numIterations,
576 # if _OPENMP < 200805
578 std::intmax_t iNumBlocksInGrid(
static_cast<std::intmax_t
>(numIterations));
580 # pragma omp for nowait schedule(runtime)
581 for(i = 0; i < iNumBlocksInGrid; ++i)
583 # pragma omp
for nowait schedule(runtime)
584 for(TIdx i = 0; i < numIterations; ++i)
588 auto wrappedLoopBody = [&loopBody](
auto idx) { loopBody(idx); };
601 template<
typename TKernel,
typename TSchedule,
typename TSfinae =
void>
613 template<
typename TLoopBody,
typename TIdx>
615 TKernel
const& kernel,
616 TLoopBody&& loopBody,
617 TIdx
const numIterations,
618 TSchedule
const& schedule)
623 std::forward<TLoopBody>(loopBody),
635 template<
typename TKernel>
647 template<
typename TLoopBody,
typename TIdx>
649 TKernel
const& kernel,
650 TLoopBody&& loopBody,
651 TIdx
const numIterations,
655 switch(schedule.
kind)
660 std::forward<TLoopBody>(loopBody),
667 std::forward<TLoopBody>(loopBody),
674 std::forward<TLoopBody>(loopBody),
681 std::forward<TLoopBody>(loopBody),
685 # if _OPENMP >= 200805
689 std::forward<TLoopBody>(loopBody),
697 std::forward<TLoopBody>(loopBody),
708 template<
typename TSchedule>
710 = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
719 template<
typename TKernel,
typename TSchedule>
730 template<
typename TKernel,
typename TSchedule>
742 template<
typename TLoopBody,
typename TIdx>
744 TKernel
const& kernel,
745 TLoopBody&& loopBody,
746 TIdx
const numIterations,
747 TSchedule
const& schedule)
752 std::forward<TLoopBody>(loopBody),
770 template<
typename TKernel,
typename TLoopBody,
typename TIdx,
typename TSchedule>
772 TKernel
const& kernel,
773 TLoopBody&& loopBody,
774 TIdx
const numIterations,
775 TSchedule
const& schedule)
784 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
788 template<
typename TWorkDiv>
791 , m_kernelFnObj(kernelFnObj)
792 , m_args(std::forward<TArgs>(args)...)
795 Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
796 "The work division and the execution task have to be of the same dimensionality!");
804 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*
this);
805 auto const blockThreadExtent = getWorkDiv<Block, Threads>(*
this);
806 auto const threadElemExtent = getWorkDiv<Thread, Elems>(*
this);
809 auto const blockSharedMemDynSizeBytes = std::apply(
810 [&](std::decay_t<TArgs>
const&... args)
812 return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
820 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
821 std::cout << __func__ <<
" blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes <<
" B"
826 TIdx
const numBlocksInGrid(gridBlockExtent.prod());
827 if(blockThreadExtent.prod() !=
static_cast<TIdx
>(1u))
829 throw std::runtime_error(
"Only one thread per block allowed in the OpenMP 2.0 block accelerator!");
833 auto const schedule = std::apply(
834 [&](std::decay_t<TArgs>
const&... args) {
835 return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
843 if(::omp_in_parallel() != 0)
845 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
846 std::cout << __func__ <<
" already within a parallel region." << std::endl;
848 parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
852 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
853 std::cout << __func__ <<
" opening new parallel region." << std::endl;
855 # pragma omp parallel
856 parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
861 template<
typename TSchedule>
863 std::size_t
const& blockSharedMemDynSizeBytes,
864 TIdx
const& numBlocksInGrid,
866 TSchedule
const& schedule)
const ->
void
868 # pragma omp single nowait
874 if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
876 throw std::runtime_error(
"The OpenMP 2.0 runtime did not create a parallel region!");
879 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
880 std::cout << __func__ <<
" omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
884 AccCpuOmp2Blocks<TDim, TIdx> acc(
885 *
static_cast<WorkDivMembers<TDim, TIdx> const*
>(
this),
886 blockSharedMemDynSizeBytes);
890 auto loopBody = [&](
auto currentIndex)
892 # if _OPENMP < 200805
893 auto const i_tidx =
static_cast<TIdx
>(currentIndex);
894 auto const index = Vec<DimInt<1u>, TIdx>(i_tidx);
896 auto const index = Vec<DimInt<1u>, TIdx>(currentIndex);
898 acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
900 std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
909 TKernelFnObj m_kernelFnObj;
910 std::tuple<std::decay_t<TArgs>...> m_args;
916 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
923 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
930 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
937 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
944 template<
typename TDim,
typename TIdx,
typename TKernelFnObj,
typename... TArgs>
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
The CPU OpenMP 2.0 block accelerator.
The CPU OpenMP 2.0 block accelerator execution task.
ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv &&workDiv, TKernelFnObj const &kernelFnObj, TArgs &&... args)
ALPAKA_FN_HOST auto operator()() const -> void
Executes the kernel function object.
A basic class holding the work division as grid block extent, block thread and thread element extent.
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
std::void_t< decltype(TKernel::ompScheduleChunkSize)> HasScheduleChunkSize
Helper type to check if TKernel has member ompScheduleChunkSize.
std::integral_constant< bool, std::is_same< TSchedule, omp::Schedule >::value > IsOmpScheduleTraitSpecialized
Helper type to check if TSchedule is a type originating from OmpSchedule trait definition.
ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
std::enable_if_t< sizeof(TKernel::ompScheduleKind) &&!IsOmpScheduleTraitSpecialized< TSchedule >::value > UseScheduleKind
Helper type to check if member ompScheduleKind of TKernel should be used.
The alpaka accelerator library.
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt &blockSharedMemSt) -> void
Frees all memory used by block shared variables.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with no schedule set.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the dynamic schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the guided schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop with the given schedule.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
Helper executor of parallel OpenMP loop with the static schedule.
ALPAKA_FN_HOST void operator()(TKernel const &, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, omp::Schedule const &schedule)
Run parallel OpenMP loop.
Executor of parallel OpenMP loop.
ALPAKA_FN_HOST void operator()(TKernel const &kernel, TLoopBody &&loopBody, TIdx const numIterations, TSchedule const &schedule)
Run parallel OpenMP loop.
Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless...
int chunkSize
Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a fixed-w...
The accelerator type trait.
The dimension getter type trait.