alpaka/BlockSyncUniformCudaHipBuiltIn_8hpp_source.html

/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber

 * SPDX-License-Identifier: MPL-2.0

 */


#pragma once


#include "alpaka/block/sync/Traits.hpp"

#include "alpaka/core/BoostPredef.hpp"

#include "alpaka/core/Interface.hpp"


#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)


namespace alpaka

{

    //! The GPU CUDA/HIP block synchronization.
    //!
    //! Empty tag type: it declares no members of its own and only marks itself, through
    //! interface::Implements, as an implementation of ConceptBlockSync. The synchronization
    //! operations themselves are supplied by the trait specializations for this type further
    //! down in this header.
    class BlockSyncUniformCudaHipBuiltIn

        : public interface::Implements<ConceptBlockSync, BlockSyncUniformCudaHipBuiltIn>

    {

    };


#    if !defined(ALPAKA_HOST_ONLY)


#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA

#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!

#        endif


#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP

#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!

#        endif


    namespace trait

    {

        //! Plain block barrier for the CUDA/HIP built-in block synchronization.
        template<>
        struct SyncBlockThreads<BlockSyncUniformCudaHipBuiltIn>
        {
            //! Executes the built-in `__syncthreads()` barrier.
            //!
            //! The block synchronization object is accepted only to select this specialization;
            //! it is never read.
            __device__ static void syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const&)
            {
                __syncthreads();
            }
        };


        //! Counting variant of the predicated block barrier for the CUDA/HIP built-in block synchronization.
        template<>
        struct SyncBlockThreadsPredicate<BlockCount, BlockSyncUniformCudaHipBuiltIn>
        {
            //! Synchronizes the block and reports how many threads passed a non-zero predicate.
            //!
            //! \param predicate Per-thread value; any non-zero value counts as "set".
            //! \return The result of `__syncthreads_count(predicate)`, or - on HIP targets that report
            //!         no sync-thread extension - the number of threads whose predicate was non-zero,
            //!         accumulated in a shared counter.
            __device__ static int syncBlockThreadsPredicate(BlockSyncUniformCudaHipBuiltIn const&, int predicate)
            {
#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
                // AMD hardware without the sync extension has no syncthreads_* operations, so the
                // count is emulated with a shared counter and plain barriers.
                __shared__ int setCount;

                // Barrier ahead of the reset - presumably so the reset cannot overtake reads of the
                // counter left over from an earlier call (NOTE(review): confirm intent).
                __syncthreads();

                // Every thread whose x index is 0 performs the reset; all of them store the same value.
                if(0 == threadIdx.x)
                {
                    setCount = 0;
                }
                __syncthreads();

                // Each thread with a set predicate contributes one increment.
                if(predicate != 0)
                {
                    ::atomicAdd(&setCount, 1);
                }

                // All increments have to be finished before any thread reads the total.
                __syncthreads();

                return setCount;
#        else
                return __syncthreads_count(predicate);
#        endif
            }
        };


        //! Logical-AND variant of the predicated block barrier for the CUDA/HIP built-in block synchronization.
        template<>
        struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncUniformCudaHipBuiltIn>
        {
            //! Synchronizes the block and reports whether every thread passed a non-zero predicate.
            //!
            //! \param predicate Per-thread value; zero means "not set".
            //! \return The result of `__syncthreads_and(predicate)`, or - on HIP targets that report
            //!         no sync-thread extension - 1 if no thread passed a zero predicate and 0 otherwise.
            __device__ static int syncBlockThreadsPredicate(BlockSyncUniformCudaHipBuiltIn const&, int predicate)
            {
#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
                // AMD hardware without the sync extension has no syncthreads_* operations, so the
                // conjunction is emulated with a shared flag and plain barriers.
                __shared__ int allSet;

                // Barrier ahead of the reset - presumably so the reset cannot overtake reads of the
                // flag left over from an earlier call (NOTE(review): confirm intent).
                __syncthreads();

                // Start from "true"; every thread whose x index is 0 stores the same initial value.
                if(0 == threadIdx.x)
                {
                    allSet = 1;
                }
                __syncthreads();

                // A single thread with a zero predicate is enough to clear the flag.
                if(predicate == 0)
                {
                    ::atomicAnd(&allSet, 0);
                }

                // All updates have to be finished before any thread reads the flag.
                __syncthreads();

                return allSet;
#        else
                return __syncthreads_and(predicate);
#        endif
            }
        };


        //! Logical-OR variant of the predicated block barrier for the CUDA/HIP built-in block synchronization.
        template<>
        struct SyncBlockThreadsPredicate<BlockOr, BlockSyncUniformCudaHipBuiltIn>
        {
            //! Synchronizes the block and reports whether at least one thread passed a non-zero predicate.
            //!
            //! \param predicate Per-thread value; any non-zero value counts as "set".
            //! \return The result of `__syncthreads_or(predicate)`, or - on HIP targets that report
            //!         no sync-thread extension - 1 if any thread passed a non-zero predicate and 0 otherwise.
            __device__ static int syncBlockThreadsPredicate(BlockSyncUniformCudaHipBuiltIn const&, int predicate)
            {
#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
                // AMD hardware without the sync extension has no syncthreads_* operations, so the
                // disjunction is emulated with a shared flag and plain barriers.
                __shared__ int anySet;

                // Barrier ahead of the reset - presumably so the reset cannot overtake reads of the
                // flag left over from an earlier call (NOTE(review): confirm intent).
                __syncthreads();

                // Start from "false"; every thread whose x index is 0 stores the same initial value.
                if(0 == threadIdx.x)
                {
                    anySet = 0;
                }
                __syncthreads();

                // A single thread with a set predicate is enough to raise the flag.
                if(predicate != 0)
                {
                    ::atomicOr(&anySet, 1);
                }

                // All updates have to be finished before any thread reads the flag.
                __syncthreads();

                return anySet;
#        else
                return __syncthreads_or(predicate);
#        endif
            }
        };

    } // namespace trait


#    endif


} // namespace alpaka


#endif

BoostPredef.hpp

Interface.hpp

Traits.hpp

alpaka::BlockSyncUniformCudaHipBuiltIn
The GPU CUDA/HIP block synchronization.
Definition BlockSyncUniformCudaHipBuiltIn.hpp:18

alpaka
The alpaka accelerator library.
Definition AccCpuOmp2Blocks.hpp:49

alpaka::syncBlockThreads
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const &blockSync) -> void
Synchronizes all threads within the current block (independently for all blocks).
Definition Traits.hpp:36

alpaka::atomicAdd
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAdd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic add operation.
Definition Traits.hpp:114

alpaka::syncBlockThreadsPredicate
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC auto syncBlockThreadsPredicate(TBlockSync const &blockSync, int predicate) -> int
Synchronizes all threads within the current block (independently for all blocks), evaluates the predicate for all threads and returns the combination of all the results computed via TOp.
Definition Traits.hpp:100

alpaka::atomicOr
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicOr(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic or operation.
Definition Traits.hpp:258

alpaka::atomicAnd
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto atomicAnd(TAtomic const &atomic, T *const addr, T const &value, THierarchy const &hier=THierarchy()) -> T
Executes an atomic and operation.
Definition Traits.hpp:240

alpaka::interface::Implements
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is implemented by the given base class (TBase).
Definition Interface.hpp:15