alpaka/EventHostManualTrigger_8hpp_source.html

/* Copyright 2024 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,

 *                Bernhard Manfred Gruber, Aurora Perego

 * SPDX-License-Identifier: MPL-2.0

 */


#pragma once


#include "alpaka/alpaka.hpp"


#include <condition_variable>

#include <mutex>

#include <utility>


namespace alpaka::test

{


    //! Customization points that the back-end sections of this file specialize per device type.
    namespace trait

    {

        //! Maps a device type to its host-manually-triggerable event type, exposed as the nested `type`.
        template<typename TDev>

        struct EventHostManualTriggerType;


        //! Reports, through a static `isSupported(dev)`, whether manually triggered events work on a device.
        template<typename TDev>

        struct IsEventHostManualTriggerSupported;

    } // namespace trait


    //! The event host manual trigger type trait alias template to remove the ::type.
    //!
    //! `EventHostManualTrigger<TDev>` names the event type that `trait::EventHostManualTriggerType` selects
    //! for the device type `TDev`.

    template<typename TDev>

    using EventHostManualTrigger = typename trait::EventHostManualTriggerType<TDev>::type;


    //! Queries whether host-manually-triggered events are usable on the given device.
    //!
    //! \tparam TDev The device type; selects the trait::IsEventHostManualTriggerSupported specialization.
    //! \param dev The device to query.
    //! \return Whatever the selected trait specialization's isSupported(dev) reports.
    template<typename TDev>
    ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(TDev const& dev) -> bool
    {
        using SupportTrait = trait::IsEventHostManualTriggerSupported<TDev>;
        return SupportTrait::isSupported(dev);
    }


    namespace cpu::detail

    {

        //! Event that can be enqueued into a queue and can be triggered by the Host.
        //!
        //! This is the state object of a CPU event: a ready flag and an enqueue counter, both protected by
        //! a mutex, plus the condition variable that enqueued waiters block on until trigger() is called.
        template<class TDev = DevCpu>
        class EventHostManualTriggerCpuImpl
        {
        public:
            //! Constructor.
            //!
            //! \param dev The device this event is bound to.
            //! A fresh event starts in the ready state with an enqueue count of zero.
            ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(TDev dev) noexcept
                : m_dev(std::move(dev))
                , m_mutex()
                , m_enqueueCount(0u)
                , m_bIsReady(true)
            {
            }

            EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const& other) = delete;
            auto operator=(EventHostManualTriggerCpuImpl const&) -> EventHostManualTriggerCpuImpl& = delete;

            //! Marks the event as ready, wakes the waiters and then sleeps for 200 ms.
            void trigger()
            {
                {
                    // Only the flag update needs the lock; it is released before notifying.
                    std::lock_guard<std::mutex> lock(m_mutex);
                    m_bIsReady = true;
                }

                // Wake every waiter instead of a single one: each waiter re-checks its own predicate (ready
                // flag or changed enqueue count), so an extra wake-up is harmless, whereas a notification
                // delivered to only one of several waiters could leave another one blocked even though its
                // predicate already holds.
                m_conditionVariable.notify_all();

                // Give alpaka time to update into the new state, process all events and tasks.
                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
            }

        public:
            TDev const m_dev; //!< The device this event is bound to.

            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.

            mutable std::condition_variable m_conditionVariable; //!< The condition signaling the event completion.

            std::size_t m_enqueueCount; //!< The number of times this event has been enqueued.

            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
                             //!< completed).
        };


    } // namespace cpu::detail


    //! Event that can be enqueued into a queue and can be triggered by the Host.

    template<class TDev = DevCpu>


    class EventHostManualTriggerCpu

    {

    public:

        //! Constructor.


        ALPAKA_FN_HOST EventHostManualTriggerCpu(TDev const& dev)

            : m_spEventImpl(std::make_shared<cpu::detail::EventHostManualTriggerCpuImpl<TDev>>(dev))

        {

        }


        //! Equality comparison operator.


        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const& rhs) const -> bool

        {

            return (m_spEventImpl == rhs.m_spEventImpl);

        }


        //! Inequality comparison operator.


        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const& rhs) const -> bool

        {

            return !((*this) == rhs);

        }


        void trigger()

        {

            m_spEventImpl->trigger();

            // Give alpaka time to update into the new state, process all events and tasks.

            std::this_thread::sleep_for(std::chrono::milliseconds(200u));

        }


    public:

        std::shared_ptr<cpu::detail::EventHostManualTriggerCpuImpl<TDev>> m_spEventImpl;

    };


    namespace trait

    {

        //! The CPU event host manual trigger type trait specialization.
        //! Selects test::EventHostManualTriggerCpu<DevCpu> as the event type for DevCpu.
        template<>


        struct EventHostManualTriggerType<DevCpu>

        {

            using type = test::EventHostManualTriggerCpu<DevCpu>;

        };


        //! The CPU event host manual trigger support get trait specialization.
        //! Manually triggered events are reported as supported for every DevCpu.

        template<>


        struct IsEventHostManualTriggerSupported<DevCpu>

        {


            //! \return Always true; the device argument is not inspected.
            ALPAKA_FN_HOST static auto isSupported(DevCpu const&) -> bool

            {

                return true;

            }


        };


    } // namespace trait

} // namespace alpaka::test


namespace alpaka::trait

{

    //! Device-get trait specialization for the CPU host-manual-trigger event.
    template<typename TDev>
    struct GetDev<test::EventHostManualTriggerCpu<TDev>>
    {
        //! \return A copy of the device stored in the event's implementation object.
        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCpu<TDev> const& event) -> TDev
        {
            auto const& impl = *event.m_spEventImpl;
            return impl.m_dev;
        }
    };


    //! Completion-test trait specialization for the CPU host-manual-trigger event.
    template<typename TDev>
    struct IsComplete<test::EventHostManualTriggerCpu<TDev>>
    {
        //! \return If the event is not waiting within a queue (not enqueued or already handled).
        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCpu<TDev> const& event) -> bool
        {
            auto const& impl = *event.m_spEventImpl;

            // The ready flag is read under the event mutex.
            std::lock_guard<std::mutex> guard(impl.m_mutex);
            return impl.m_bIsReady;
        }
    };


    //! Enqueue trait specialization: puts a CPU host-manual-trigger event into a non-blocking CPU queue.
    template<typename TDev>
    struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
    {
        //! Marks the event as enqueued and submits a task to the queue's worker thread that blocks until the
        //! event is triggered or has been re-enqueued in the meantime.
        ALPAKA_FN_HOST static auto enqueue(
            QueueGenericThreadsNonBlocking<TDev>& queue,
            test::EventHostManualTriggerCpu<TDev>& event) -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // The submitted task captures this copy, which keeps the implementation alive while it is enqueued.
            auto spEventImpl = event.m_spEventImpl;

            // The state change and the submission happen under one lock so that they appear atomic.
            std::lock_guard<std::mutex> stateGuard(spEventImpl->m_mutex);

            // An event must not be enqueued again while it is still pending.
            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
            spEventImpl->m_bIsReady = false;

            // Every enqueue gets its own generation number. A waiter that belongs to an older generation
            // stops waiting as soon as it sees a newer one, which avoids deadlocks on re-enqueued events.
            auto const enqueueCount = ++spEventImpl->m_enqueueCount;

            queue.m_spQueueImpl->m_workerThread.submit(
                [spEventImpl, enqueueCount]() mutable
                {
                    std::unique_lock<std::mutex> waitLock(spEventImpl->m_mutex);

                    // Block while this enqueue is still the current one and the event is not ready.
                    while((enqueueCount == spEventImpl->m_enqueueCount) && !spEventImpl->m_bIsReady)
                    {
                        spEventImpl->m_conditionVariable.wait(waitLock);
                    }
                });
        }
    };


    //! Enqueue trait specialization: puts a CPU host-manual-trigger event into a blocking CPU queue.
    template<typename TDev>
    struct Enqueue<QueueGenericThreadsBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
    {
        //! Marks the event as enqueued and blocks the calling thread until the event is triggered or has
        //! been re-enqueued in the meantime. The queue argument itself is not used.
        ALPAKA_FN_HOST static auto enqueue(
            QueueGenericThreadsBlocking<TDev>&,
            test::EventHostManualTriggerCpu<TDev>& event) -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // Hold our own reference so the implementation stays alive for the duration of the wait.
            auto spEventImpl = event.m_spEventImpl;

            // One lock covers both the state change and the wait; the condition variable releases it
            // while the thread is blocked.
            std::unique_lock<std::mutex> waitLock(spEventImpl->m_mutex);

            // An event must not be enqueued again while it is still pending.
            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
            spEventImpl->m_bIsReady = false;

            // Every enqueue gets its own generation number. A waiter that belongs to an older generation
            // stops waiting as soon as it sees a newer one, which avoids deadlocks on re-enqueued events.
            auto const enqueueCount = ++spEventImpl->m_enqueueCount;

            // Block while this enqueue is still the current one and the event is not ready.
            while((enqueueCount == spEventImpl->m_enqueueCount) && !spEventImpl->m_bIsReady)
            {
                spEventImpl->m_conditionVariable.wait(waitLock);
            }
        }
    };


} // namespace alpaka::trait


#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED


#    include "alpaka/core/Config.hpp"


#    include <cuda.h>


#    if !ALPAKA_LANG_CUDA && !defined(ALPAKA_HOST_ONLY)

#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!

#    endif


#    include "alpaka/core/Cuda.hpp"


namespace alpaka::test

{


    namespace uniform_cuda_hip::detail

    {


        //! State object of a host-triggerable event for CUDA devices.
        //!
        //! Owns a 32-bit word in device memory. The constructor zeroes it and trigger() overwrites it; the
        //! CUDA enqueue specializations in this file make the stream wait on that word.
        class EventHostManualTriggerCudaImpl final
        {
            // NOTE(review): not referenced directly in this class; presumably required by the
            // ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK macros - confirm before removing.
            using TApi = alpaka::ApiCudaRt;

        public:
            //! Allocates and zeroes the device word on \p dev. A fresh event starts in the ready state.
            ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(DevCudaRt const& dev)
                : m_dev(dev)
                , m_mutex()
                , m_bIsReady(true)
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.getNativeHandle()));
                // Allocate the buffer on this device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
                // Initiate the memory set.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    cudaMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
            }

            EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const&) = delete;
            auto operator=(EventHostManualTriggerCudaImpl const&) -> EventHostManualTriggerCudaImpl& = delete;

            //! Releases the device word.
            ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Free the buffer.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cudaFree(m_devMem));
            }

            //! Marks the event as ready, writes the release value to the device word and then sleeps 200 ms.
            void trigger()
            {
                {
                    // The flag and the device word are updated together under the event mutex.
                    std::lock_guard<std::mutex> lock(m_mutex);
                    m_bIsReady = true;

                    // Set the current device.
                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.getNativeHandle()));
                    // Initiate the memory set. Every byte of the word is set to 1, so the word reads
                    // 0x01010101, which is the threshold the enqueue specializations wait for.
                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                        cudaMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
                }

                // Give alpaka time to update into the new state, process all events and tasks.
                // The mutex is released first: sleeping while holding it would block every isComplete()
                // and enqueue() call for the whole grace period.
                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
            }

        public:
            DevCudaRt const m_dev; //!< The device this event is bound to.

            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.

            void* m_devMem = nullptr; //!< The 32-bit device word the stream waits on.

            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
                             //!< completed).
        };


    } // namespace uniform_cuda_hip::detail


    class EventHostManualTriggerCuda final

    {

    public:


        ALPAKA_FN_HOST EventHostManualTriggerCuda(DevCudaRt const& dev)

            : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl>(dev))

        {

            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

        }


        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const& rhs) const -> bool

        {

            return (m_spEventImpl == rhs.m_spEventImpl);

        }


        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const& rhs) const -> bool

        {

            return !((*this) == rhs);

        }


        void trigger()

        {

            m_spEventImpl->trigger();

            // Give alpaka time to update into the new state, process all events and tasks.

            std::this_thread::sleep_for(std::chrono::milliseconds(200u));

        }


    public:

        std::shared_ptr<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl> m_spEventImpl;

    };


    namespace trait

    {

        //! The CUDA event host manual trigger type trait specialization.
        //! Selects test::EventHostManualTriggerCuda as the event type for DevCudaRt.
        template<>


        struct EventHostManualTriggerType<DevCudaRt>

        {

            using type = test::EventHostManualTriggerCuda;

        };


        //! The CUDA event host manual trigger support get trait specialization.
        template<>
        struct IsEventHostManualTriggerSupported<DevCudaRt>
        {
            //! \param dev The CUDA device to query (only inspected when building against CUDA < 11.7).
            //! \return Before CUDA 11.7: whether the driver reports the stream memory operations attribute
            //!         for the device; a failed query counts as "not supported". From CUDA 11.7 on: always
            //!         true.
            ALPAKA_FN_HOST static auto isSupported([[maybe_unused]] DevCudaRt const& dev) -> bool
            {
#    if CUDA_VERSION < 11070
                int result = 0;
                // Do not trust `result` unless the query itself succeeded.
                auto const status
                    = cuDeviceGetAttribute(&result, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev.getNativeHandle());
                return (status == CUDA_SUCCESS) && (result != 0);
#    else
                return true; // Always enabled as of CUDA 11.7
#    endif
            }
        };


    } // namespace trait

} // namespace alpaka::test


namespace alpaka::trait

{

    namespace detail

    {

        // TODO: Replace with cuStreamWaitValue32 once support for CUDA < 12 is dropped.

        //! Forwards to the 32-bit stream wait function of the CUDA driver API that matches CUDA_VERSION.
        //!
        //! \return The CUresult of the selected driver call; all arguments are passed through unchanged.
        inline auto streamWaitValue(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags)
            -> CUresult
        {
            // NVIDIA introduced a new stream memory ops API with CUDA 11.7 (called v2) whose functions carry
            // a `_v2` suffix. CUDA 12.0 removed v1 of the API and dropped the suffix from the new functions.
            // Only the 11.7 <= CUDA < 12.0 window therefore needs the suffixed name; outside of it the
            // unsuffixed name is used, although it internally does different things before and after.
#    if(CUDA_VERSION >= 11070) && (CUDA_VERSION < 12000)
            return cuStreamWaitValue32_v2(stream, addr, value, flags);
#    else
            return cuStreamWaitValue32(stream, addr, value, flags);
#    endif
        }


    } // namespace detail


    //! Device-get trait specialization for the CUDA host-manual-trigger event.
    template<>
    struct GetDev<test::EventHostManualTriggerCuda>
    {
        //! \return A copy of the device stored in the event's implementation object.
        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCuda const& event) -> DevCudaRt
        {
            auto const& impl = *event.m_spEventImpl;
            return impl.m_dev;
        }
    };


    //! Completion-test trait specialization for the CUDA host-manual-trigger event.
    template<>
    struct IsComplete<test::EventHostManualTriggerCuda>
    {
        //! \return If the event is not waiting within a queue (not enqueued or already handled).
        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCuda const& event) -> bool
        {
            auto const& impl = *event.m_spEventImpl;

            // The ready flag is read under the event mutex.
            std::lock_guard<std::mutex> guard(impl.m_mutex);
            return impl.m_bIsReady;
        }
    };


    //! Enqueue trait specialization: puts a CUDA host-manual-trigger event into a non-blocking CUDA queue.
    template<>
    struct Enqueue<QueueCudaRtNonBlocking, test::EventHostManualTriggerCuda>
    {
        //! Marks the event as enqueued and makes the queue's stream wait until the event's device word
        //! is greater than or equal to 0x01010101.
        ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, test::EventHostManualTriggerCuda& event)
            -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // Hold our own reference so the implementation stays alive while this function runs.
            auto spEventImpl = event.m_spEventImpl;

            // The state change and the stream operation happen under one lock so that they appear atomic.
            std::lock_guard<std::mutex> stateGuard(spEventImpl->m_mutex);

            // An event must not be enqueued again while it is still pending.
            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
            spEventImpl->m_bIsReady = false;

            // Known issue quoted from the PGI Profiler's User Guide (Events and Metrics):
            //   In event or metric profiling, kernel launches are blocking. Thus kernels waiting on host
            //   updates may hang. This includes synchronization between the host and the device built upon
            //   value-based CUDA queue synchronization APIs such as cuStreamWaitValue32() and
            //   cuStreamWriteValue32().
            // NOTE(review): nothing in this function resets the device word, so after a trigger() a second
            // enqueue presumably finds the wait condition already satisfied - confirm whether that is intended.
            ALPAKA_CUDA_DRV_CHECK(detail::streamWaitValue(
                static_cast<CUstream>(queue.getNativeHandle()),
                reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
                0x0101'0101u,
                CU_STREAM_WAIT_VALUE_GEQ));
        }
    };


    //! Enqueue trait specialization: puts a CUDA host-manual-trigger event into a blocking CUDA queue.
    template<>
    struct Enqueue<QueueCudaRtBlocking, test::EventHostManualTriggerCuda>
    {
        //! Marks the event as enqueued and makes the queue's stream wait until the event's device word
        //! is greater than or equal to 0x01010101.
        ALPAKA_FN_HOST static auto enqueue(QueueCudaRtBlocking& queue, test::EventHostManualTriggerCuda& event) -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // Hold our own reference so the implementation stays alive while this function runs.
            auto spEventImpl = event.m_spEventImpl;

            // The state change and the stream operation happen under one lock so that they appear atomic.
            std::lock_guard<std::mutex> stateGuard(spEventImpl->m_mutex);

            // An event must not be enqueued again while it is still pending.
            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
            spEventImpl->m_bIsReady = false;

            // Known issue quoted from the PGI Profiler's User Guide (Events and Metrics):
            //   In event or metric profiling, kernel launches are blocking. Thus kernels waiting on host
            //   updates may hang. This includes synchronization between the host and the device built upon
            //   value-based CUDA queue synchronization APIs such as cuStreamWaitValue32() and
            //   cuStreamWriteValue32().
            ALPAKA_CUDA_DRV_CHECK(detail::streamWaitValue(
                static_cast<CUstream>(queue.getNativeHandle()),
                reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
                0x0101'0101u,
                CU_STREAM_WAIT_VALUE_GEQ));
        }
    };


} // namespace alpaka::trait

#endif


#ifdef ALPAKA_ACC_GPU_HIP_ENABLED


#    include <hip/hip_runtime.h>


#    if !ALPAKA_LANG_HIP && !defined(ALPAKA_HOST_ONLY)

#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!

#    endif


#    include "alpaka/core/Hip.hpp"


namespace alpaka::test

{

    namespace hip::detail

    {

        //! State object of a host-triggerable event for HIP devices.
        //!
        //! Owns a 32-bit word in device memory. The constructor zeroes it and trigger() overwrites it; the
        //! HIP enqueue specializations in this file poll that word.
        class EventHostManualTriggerHipImpl final
        {
            // NOTE(review): not referenced directly in this class; presumably required by the
            // ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK macros - confirm before removing.
            using TApi = alpaka::ApiHipRt;

        public:
            //! Allocates and zeroes the device word on \p dev. A fresh event starts in the ready state.
            ALPAKA_FN_HOST EventHostManualTriggerHipImpl(DevHipRt const& dev) : m_dev(dev), m_mutex(), m_bIsReady(true)
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.getNativeHandle()));
                // Allocate the buffer on this device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
                // Initiate the memory set.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    hipMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
            }

            EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl const&) = delete;
            auto operator=(EventHostManualTriggerHipImpl const&) -> EventHostManualTriggerHipImpl& = delete;

            //! Releases the device word.
            ALPAKA_FN_HOST ~EventHostManualTriggerHipImpl()
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Free the buffer.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(hipFree(m_devMem));
            }

            //! Marks the event as ready, writes the release value to the device word and then sleeps 200 ms.
            void trigger()
            {
                {
                    // The flag and the device word are updated together under the event mutex.
                    std::lock_guard<std::mutex> lock(m_mutex);
                    m_bIsReady = true;

                    // Set the current device.
                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.getNativeHandle()));
                    // Initiate the memory set. Every byte of the word is set to 1, so the word reads
                    // 0x01010101, which is the threshold the enqueue specializations poll for.
                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                        hipMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
                }

                // Give alpaka time to update into the new state, process all events and tasks.
                // The mutex is released first: sleeping while holding it would block every isComplete()
                // and enqueue() call for the whole grace period.
                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
            }

        public:
            DevHipRt const m_dev; //!< The device this event is bound to.

            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.

            void* m_devMem = nullptr; //!< The 32-bit device word that enqueued waits poll.

            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
                             //!< completed).
        };

    } // namespace hip::detail


    class EventHostManualTriggerHip final

    {

    public:

        ALPAKA_FN_HOST EventHostManualTriggerHip(DevHipRt const& dev)

            : m_spEventImpl(std::make_shared<hip::detail::EventHostManualTriggerHipImpl>(dev))

        {

            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

        }


        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerHip const& rhs) const -> bool

        {

            return (m_spEventImpl == rhs.m_spEventImpl);

        }


        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerHip const& rhs) const -> bool

        {

            return !((*this) == rhs);

        }


        void trigger()

        {

            m_spEventImpl->trigger();

            // Give alpaka time to update into the new state, process all events and tasks.

            std::this_thread::sleep_for(std::chrono::milliseconds(200u));

        }


    public:

        std::shared_ptr<hip::detail::EventHostManualTriggerHipImpl> m_spEventImpl;

    };


    namespace trait

    {

        //! The HIP event host manual trigger type trait specialization.
        //! Selects test::EventHostManualTriggerHip as the event type for DevHipRt.
        template<>

        struct EventHostManualTriggerType<DevHipRt>

        {

            using type = test::EventHostManualTriggerHip;

        };


        //! The HIP event host manual trigger support get trait specialization.
        //! Manually triggered events are reported as unsupported for every DevHipRt.

        template<>

        struct IsEventHostManualTriggerSupported<DevHipRt>

        {

            // TODO: there is no CUDA_VERSION in the HIP compiler path.

            // TODO: there is a hipDeviceGetAttribute, but there is no counterpart for

            // CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.

            //! \return Always false; the device argument is not inspected.
            ALPAKA_FN_HOST static auto isSupported(DevHipRt const&) -> bool

            {

                return false;

            }

        };

    } // namespace trait

} // namespace alpaka::test


namespace alpaka::trait

{

    //! Device-get trait specialization for the HIP host-manual-trigger event.
    template<>
    struct GetDev<test::EventHostManualTriggerHip>
    {
        //! \return A copy of the device stored in the event's implementation object.
        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerHip const& event) -> DevHipRt
        {
            auto const& impl = *event.m_spEventImpl;
            return impl.m_dev;
        }
    };


    //! Completion-test trait specialization for the HIP host-manual-trigger event.
    template<>
    struct IsComplete<test::EventHostManualTriggerHip>
    {
        //! \return If the event is not waiting within a queue (not enqueued or already handled).
        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerHip const& event) -> bool
        {
            auto const& impl = *event.m_spEventImpl;

            // The ready flag is read under the event mutex.
            std::lock_guard<std::mutex> guard(impl.m_mutex);
            return impl.m_bIsReady;
        }
    };


    //! Enqueue trait specialization: puts a HIP host-manual-trigger event into a non-blocking HIP queue.
    //!
    //! HIP offers no stream wait-on-value operation here, so the wait is emulated on the host by repeatedly
    //! copying the event's device word through the queue's stream until it reaches 0x01010101.
    template<>
    struct Enqueue<QueueHipRtNonBlocking, test::EventHostManualTriggerHip>
    {
        // NOTE(review): not referenced directly in this struct; presumably required by the
        // ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK macros - confirm before removing.
        using TApi = alpaka::ApiHipRt;

        //! Marks the event as enqueued and then polls the device word until it has been triggered.
        ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, test::EventHostManualTriggerHip& event)
            -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
            auto spEventImpl(event.m_spEventImpl);

            {
                // Only the state change is done under the event mutex. The lock must be released before
                // polling: the implementation's trigger() takes the same mutex before it writes the value
                // polled for below, so polling while holding the lock could never terminate.
                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);

                // The event should not yet be enqueued.
                ALPAKA_ASSERT(spEventImpl->m_bIsReady);

                // Set its state to enqueued.
                spEventImpl->m_bIsReady = false;
            }

            // Known issue quoted from the PGI Profiler's User Guide (Events and Metrics):
            //   In event or metric profiling, kernel launches are blocking. Thus kernels waiting on host
            //   updates may hang. This includes synchronization between the host and the device built upon
            //   value-based CUDA queue synchronization APIs such as cuStreamWaitValue32() and
            //   cuStreamWriteValue32().
            int32_t hostMem = 0;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
            std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not "
                         "available.\n";
#    endif
            // Each iteration copies the word within the queue's stream and waits for the stream to drain.
            while(hostMem < 0x0101'0101)
            {
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMemcpyDtoHAsync(
                    &hostMem,
                    reinterpret_cast<hipDeviceptr_t>(event.m_spEventImpl->m_devMem),
                    sizeof(int32_t),
                    queue.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamSynchronize(queue.getNativeHandle()));
            }
        }
    };


    //! Enqueue trait specialization: puts a HIP host-manual-trigger event into a blocking HIP queue.
    //!
    //! HIP offers no stream wait-on-value operation here, so the wait is emulated on the host by copying
    //! the event's device word every 10 ms until it reaches 0x01010101. The queue argument is not used.
    template<>
    struct Enqueue<QueueHipRtBlocking, test::EventHostManualTriggerHip>
    {
        // NOTE(review): not referenced directly in this struct; presumably required by the
        // ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK macros - confirm before removing.
        using TApi = alpaka::ApiHipRt;

        //! Marks the event as enqueued and then blocks the caller until the device word has been triggered.
        ALPAKA_FN_HOST static auto enqueue(QueueHipRtBlocking& /* queue */, test::EventHostManualTriggerHip& event)
            -> void
        {
            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
            auto spEventImpl(event.m_spEventImpl);

            {
                // Only the state change is done under the event mutex. The lock must be released before
                // polling: the implementation's trigger() takes the same mutex before it writes the value
                // polled for below, so polling while holding the lock could never terminate.
                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);

                // The event should not yet be enqueued.
                ALPAKA_ASSERT(spEventImpl->m_bIsReady);

                // Set its state to enqueued.
                spEventImpl->m_bIsReady = false;
            }

            // Known issue quoted from the PGI Profiler's User Guide (Events and Metrics):
            //   In event or metric profiling, kernel launches are blocking. Thus kernels waiting on host
            //   updates may hang. This includes synchronization between the host and the device built upon
            //   value-based HIP queue synchronization APIs such as cuStreamWaitValue32() and
            //   cuStreamWriteValue32().

            // workaround for missing cuStreamWaitValue32 in HIP
            std::uint32_t hmem = 0;
            do
            {
                std::this_thread::sleep_for(std::chrono::milliseconds(10u));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault));
            } while(hmem < 0x0101'0101u);
        }
    };

} // namespace alpaka::trait

#endif


#ifdef ALPAKA_ACC_SYCL_ENABLED

namespace alpaka

{

    namespace test

    {

        //! Placeholder host-manual-trigger event for SYCL devices.
        //! Both the constructor and trigger() have empty bodies, so the event holds no state.
        template<concepts::Tag TTag>

        class EventHostManualTriggerSycl

        {

        public:

            //! Constructor. The device argument is ignored.
            EventHostManualTriggerSycl(DevGenericSycl<TTag> const&)

            {

            }


            //! Does nothing.
            auto trigger()

            {

            }

        };


        namespace trait

        {

            //! The SYCL event host manual trigger type trait specialization.
            //! Selects EventHostManualTriggerSycl<TTag> as the event type for DevGenericSycl<TTag>.
            template<concepts::Tag TTag>

            struct EventHostManualTriggerType<DevGenericSycl<TTag>>

            {

                using type = alpaka::test::EventHostManualTriggerSycl<TTag>;

            };


            //! The SYCL event host manual trigger support get trait specialization.
            //! Manually triggered events are reported as unsupported for every SYCL device.
            template<concepts::Tag TTag>

            struct IsEventHostManualTriggerSupported<DevGenericSycl<TTag>>

            {

                //! \return Always false; the device argument is not inspected.
                ALPAKA_FN_HOST static auto isSupported(DevGenericSycl<TTag> const&) -> bool

                {

                    return false;

                }

            };

        } // namespace trait

    } // namespace test


    namespace trait

    {

        //! Enqueue trait specialization for the SYCL placeholder event on a blocking SYCL queue.
        template<concepts::Tag TTag>

        struct Enqueue<QueueGenericSyclBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>

        {

            //! Does nothing: neither the queue nor the event is touched.
            ALPAKA_FN_HOST static auto enqueue(

                QueueGenericSyclBlocking<TTag>& /* queue */,

                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void

            {

            }

        };


        //! Enqueue trait specialization for the SYCL placeholder event on a non-blocking SYCL queue.
        template<concepts::Tag TTag>

        struct Enqueue<QueueGenericSyclNonBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>

        {

            //! Does nothing: neither the queue nor the event is touched.
            ALPAKA_FN_HOST static auto enqueue(

                QueueGenericSyclNonBlocking<TTag>& /* queue */,

                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void

            {

            }

        };


        //! Completion-test trait specialization for the SYCL placeholder event.
        template<concepts::Tag TTag>

        struct IsComplete<test::EventHostManualTriggerSycl<TTag>>

        {

            //! \return Always true; the event argument is not inspected.
            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerSycl<TTag> const& /* event */) -> bool

            {

                return true;

            }

        };

    } // namespace trait

} // namespace alpaka

#endif

ALPAKA_ASSERT
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition Assert.hpp:13

Config.hpp

Cuda.hpp

ALPAKA_CUDA_DRV_CHECK
#define ALPAKA_CUDA_DRV_CHECK(cmd)
CUDA driver error checking with log and exception.
Definition Cuda.hpp:54

ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55

Hip.hpp

ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
Definition UniformCudaHip.hpp:112

ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
Definition UniformCudaHip.hpp:105

alpaka.hpp

alpaka::DevCpu
The CPU device handle.
Definition DevCpu.hpp:56

alpaka::DevUniformCudaHipRt
The CUDA/HIP RT device handle.
Definition DevUniformCudaHipRt.hpp:62

alpaka::DevUniformCudaHipRt::getNativeHandle
auto getNativeHandle() const noexcept -> int
Definition DevUniformCudaHipRt.hpp:83

alpaka::QueueGenericThreadsBlocking
The CPU device queue.
Definition QueueGenericThreadsBlocking.hpp:74

alpaka::QueueGenericThreadsNonBlocking
The CPU device queue.
Definition QueueGenericThreadsNonBlocking.hpp:83

alpaka::test::EventHostManualTriggerCpu
Event that can be enqueued into a queue and can be triggered by the Host.
Definition EventHostManualTrigger.hpp:81

alpaka::test::EventHostManualTriggerCpu::trigger
void trigger()
Definition EventHostManualTrigger.hpp:101

alpaka::test::EventHostManualTriggerCpu::operator!=
ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const &rhs) const -> bool
Inequality comparison operator.
Definition EventHostManualTrigger.hpp:96

alpaka::test::EventHostManualTriggerCpu::operator==
ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const &rhs) const -> bool
Equality comparison operator.
Definition EventHostManualTrigger.hpp:90

alpaka::test::EventHostManualTriggerCpu::m_spEventImpl
std::shared_ptr< cpu::detail::EventHostManualTriggerCpuImpl< TDev > > m_spEventImpl
Definition EventHostManualTrigger.hpp:109

alpaka::test::EventHostManualTriggerCpu::EventHostManualTriggerCpu
ALPAKA_FN_HOST EventHostManualTriggerCpu(TDev const &dev)
Constructor.
Definition EventHostManualTrigger.hpp:84

alpaka::test::EventHostManualTriggerCuda
Definition EventHostManualTrigger.hpp:309

alpaka::test::EventHostManualTriggerCuda::operator==
ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const &rhs) const -> bool
Definition EventHostManualTrigger.hpp:317

alpaka::test::EventHostManualTriggerCuda::EventHostManualTriggerCuda
ALPAKA_FN_HOST EventHostManualTriggerCuda(DevCudaRt const &dev)
Definition EventHostManualTrigger.hpp:311

alpaka::test::EventHostManualTriggerCuda::operator!=
ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const &rhs) const -> bool
Definition EventHostManualTrigger.hpp:322

alpaka::test::EventHostManualTriggerCuda::trigger
void trigger()
Definition EventHostManualTrigger.hpp:327

alpaka::test::EventHostManualTriggerCuda::m_spEventImpl
std::shared_ptr< uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl > m_spEventImpl
Definition EventHostManualTrigger.hpp:335

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl
Event that can be enqueued into a queue and can be triggered by the Host.
Definition EventHostManualTrigger.hpp:40

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::operator=
auto operator=(EventHostManualTriggerCpuImpl const &) -> EventHostManualTriggerCpuImpl & = delete

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::m_dev
TDev const m_dev
The device this event is bound to.
Definition EventHostManualTrigger.hpp:66

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::trigger
void trigger()
Definition EventHostManualTrigger.hpp:54

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::EventHostManualTriggerCpuImpl
EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const &other)=delete

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::m_bIsReady
bool m_bIsReady
True if the event is not waiting within a queue (i.e. it is not enqueued or has already completed).
Definition EventHostManualTrigger.hpp:73

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::m_enqueueCount
std::size_t m_enqueueCount
The number of times this event has been enqueued.
Definition EventHostManualTrigger.hpp:71

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::m_mutex
std::mutex m_mutex
The mutex used to synchronize access to the event.
Definition EventHostManualTrigger.hpp:68

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::EventHostManualTriggerCpuImpl
ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(TDev dev) noexcept
Constructor.
Definition EventHostManualTrigger.hpp:43

alpaka::test::cpu::detail::EventHostManualTriggerCpuImpl::m_conditionVariable
std::condition_variable m_conditionVariable
The condition signaling the event completion.
Definition EventHostManualTrigger.hpp:70

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl
Definition EventHostManualTrigger.hpp:252

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::trigger
void trigger()
Definition EventHostManualTrigger.hpp:283

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::m_devMem
void * m_devMem
Definition EventHostManualTrigger.hpp:301

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::~EventHostManualTriggerCudaImpl
ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
Definition EventHostManualTrigger.hpp:275

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::m_bIsReady
bool m_bIsReady
True if the event is not waiting within a queue (i.e. it is not enqueued or has already completed).
Definition EventHostManualTrigger.hpp:303

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::m_mutex
std::mutex m_mutex
The mutex used to synchronize access to the event.
Definition EventHostManualTrigger.hpp:300

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::operator=
auto operator=(EventHostManualTriggerCudaImpl const &) -> EventHostManualTriggerCudaImpl & = delete

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::m_dev
DevCudaRt const m_dev
The device this event is bound to.
Definition EventHostManualTrigger.hpp:298

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::EventHostManualTriggerCudaImpl
EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const &)=delete

alpaka::test::uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl::EventHostManualTriggerCudaImpl
ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(DevCudaRt const &dev)
Definition EventHostManualTrigger.hpp:256

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt
The CUDA/HIP RT queue.
Definition QueueUniformCudaHipRt.hpp:115

ALPAKA_FN_HOST
#define ALPAKA_FN_HOST
Definition Common.hpp:40

alpaka::internal::operator==
constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex< T > const &lhs, Complex< T > const &rhs)
Equality of two complex numbers.
Definition Complex.hpp:294

alpaka::internal::operator!=
constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex< T > const &lhs, Complex< T > const &rhs)
Inequality of two complex numbers.
Definition Complex.hpp:320

alpaka::test
The test specifics.
Definition TestAccs.hpp:27

alpaka::test::isEventHostManualTriggerSupported
ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(TDev const &dev) -> bool
Definition EventHostManualTrigger.hpp:30

alpaka::test::EventHostManualTrigger
typename trait::EventHostManualTriggerType< TDev >::type EventHostManualTrigger
Alias template for the event host manual trigger type trait that removes the need to write ::type.
Definition EventHostManualTrigger.hpp:27

alpaka::trait::detail::streamWaitValue
auto streamWaitValue(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) -> CUresult
Definition EventHostManualTrigger.hpp:369

alpaka::trait
The accelerator traits.
Definition AccCpuOmp2Blocks.hpp:115

alpaka
The alpaka accelerator library.
Definition AccCpuOmp2Blocks.hpp:52

alpaka::isComplete
ALPAKA_FN_HOST auto isComplete(TEvent const &event) -> bool
Tests if the given event has already been completed.
Definition Traits.hpp:34

alpaka::getDev
ALPAKA_FN_HOST auto getDev(T const &t)
Definition Traits.hpp:68

alpaka::enqueue
ALPAKA_FN_HOST auto enqueue(TQueue &queue, TTask &&task) -> void
Queues the given task in the given queue.
Definition Traits.hpp:47

std
STL namespace.

alpaka::ApiCudaRt
Definition ApiCudaRt.hpp:16

alpaka::test::trait::EventHostManualTriggerType
Definition EventHostManualTrigger.hpp:19

alpaka::test::trait::IsEventHostManualTriggerSupported< DevCpu >::isSupported
static ALPAKA_FN_HOST auto isSupported(DevCpu const &) -> bool
Definition EventHostManualTrigger.hpp:124

alpaka::test::trait::IsEventHostManualTriggerSupported< DevCudaRt >::isSupported
static ALPAKA_FN_HOST auto isSupported(DevCudaRt const &dev) -> bool
Definition EventHostManualTrigger.hpp:350

alpaka::test::trait::IsEventHostManualTriggerSupported
Definition EventHostManualTrigger.hpp:22

alpaka::trait::Enqueue< QueueCudaRtBlocking, test::EventHostManualTriggerCuda >::enqueue
static ALPAKA_FN_HOST auto enqueue(QueueCudaRtBlocking &queue, test::EventHostManualTriggerCuda &event) -> void
Definition EventHostManualTrigger.hpp:444

alpaka::trait::Enqueue< QueueCudaRtNonBlocking, test::EventHostManualTriggerCuda >::enqueue
static ALPAKA_FN_HOST auto enqueue(QueueCudaRtNonBlocking &queue, test::EventHostManualTriggerCuda &event) -> void
Definition EventHostManualTrigger.hpp:410

alpaka::trait::Enqueue< QueueGenericThreadsBlocking< TDev >, test::EventHostManualTriggerCpu< TDev > >::enqueue
static ALPAKA_FN_HOST auto enqueue(QueueGenericThreadsBlocking< TDev > &, test::EventHostManualTriggerCpu< TDev > &event) -> void
Definition EventHostManualTrigger.hpp:203

alpaka::trait::Enqueue< QueueGenericThreadsNonBlocking< TDev >, test::EventHostManualTriggerCpu< TDev > >::enqueue
static ALPAKA_FN_HOST auto enqueue(QueueGenericThreadsNonBlocking< TDev > &queue, test::EventHostManualTriggerCpu< TDev > &event) -> void
Definition EventHostManualTrigger.hpp:162

alpaka::trait::Enqueue
The queue enqueue trait.
Definition Traits.hpp:27

alpaka::trait::GetDev< test::EventHostManualTriggerCpu< TDev > >::getDev
static ALPAKA_FN_HOST auto getDev(test::EventHostManualTriggerCpu< TDev > const &event) -> TDev
Definition EventHostManualTrigger.hpp:139

alpaka::trait::GetDev< test::EventHostManualTriggerCuda >::getDev
static ALPAKA_FN_HOST auto getDev(test::EventHostManualTriggerCuda const &event) -> DevCudaRt
Definition EventHostManualTrigger.hpp:388

alpaka::trait::GetDev
The device get trait.
Definition Traits.hpp:27

alpaka::trait::IsComplete< test::EventHostManualTriggerCpu< TDev > >::isComplete
static ALPAKA_FN_HOST auto isComplete(test::EventHostManualTriggerCpu< TDev > const &event) -> bool
Definition EventHostManualTrigger.hpp:150

alpaka::trait::IsComplete< test::EventHostManualTriggerCuda >::isComplete
static ALPAKA_FN_HOST auto isComplete(test::EventHostManualTriggerCuda const &event) -> bool
Definition EventHostManualTrigger.hpp:399

alpaka::trait::IsComplete
The event tester trait.
Definition Traits.hpp:21