alpaka/QueueUniformCudaHipRt_8hpp_source.html

/* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,

 * Antonio Di Pilato

 * SPDX-License-Identifier: MPL-2.0

 */


#pragma once


#include "alpaka/core/CallbackThread.hpp"

#include "alpaka/core/Cuda.hpp"

#include "alpaka/core/Hip.hpp"

#include "alpaka/core/Interface.hpp"

#include "alpaka/dev/Traits.hpp"

#include "alpaka/event/Traits.hpp"

#include "alpaka/meta/DependentFalseType.hpp"

#include "alpaka/queue/Traits.hpp"

#include "alpaka/traits/Traits.hpp"

#include "alpaka/wait/Traits.hpp"


#include <condition_variable>

#include <functional>

#include <future>

#include <memory>

#include <mutex>

#include <thread>


#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)


namespace alpaka

{

    template<typename TApi>

    class EventUniformCudaHipRt;


    template<typename TApi>

    class DevUniformCudaHipRt;


    namespace uniform_cuda_hip::detail

    {

        //! The CUDA/HIP RT queue implementation.
        //!
        //! Holds the native stream created for a device, a copy of that device and the CallbackThread that host
        //! tasks enqueued into the stream are submitted to. The stream is created in the constructor and is
        //! synchronized and then destroyed in the destructor.
        //!
        //! \tparam TApi The runtime API wrapper. It supplies Stream_t, setDevice, streamCreateWithFlags,
        //! streamSynchronize, streamDestroy and the streamNonBlocking flag.
        template<typename TApi>
        class QueueUniformCudaHipRtImpl final
        {
        public:
            //! Creates a new non-blocking stream on the given device.
            //!
            //! \param dev The device this queue is bound to. A copy is kept in m_dev.
            ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt<TApi> const& dev)
                : m_dev(dev)
                , m_UniformCudaHipQueue()
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // The stream is created on the current device, so make our device the current one first.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));

                // Stream creation flags:
                // - [cuda/hip]StreamDefault: the default queue creation flag.
                // - [cuda/hip]StreamNonBlocking: work in the created queue may run concurrently with work in
                //   queue 0 (the NULL queue) and the created queue performs no implicit synchronization with
                //   queue 0.
                // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka
                // CPU queue. Implementing implicit default queue synchronization on the CPU would be too much work.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::streamCreateWithFlags(&m_UniformCudaHipQueue, TApi::streamNonBlocking));
            }

            QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl&&) = default;
            auto operator=(QueueUniformCudaHipRtImpl&&) -> QueueUniformCudaHipRtImpl& = delete;

            //! Waits for all work in the stream and then destroys the stream.
            ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl()
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Plain CUDA/HIP operations would not require a synchronization before the stream is destroyed.
                // However, the stream may contain host functions which reference this queue instance and its
                // CallbackThread. Waiting for all pending asynchronous work guarantees that they are done and keeps
                // the destruction deterministic.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamSynchronize(m_UniformCudaHipQueue));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamDestroy(m_UniformCudaHipQueue));
            }

            //! \return The native stream handle of this queue (by value).
            [[nodiscard]] auto getNativeHandle() const noexcept
            {
                return m_UniformCudaHipQueue;
            }

            DevUniformCudaHipRt<TApi> const m_dev; //!< The device this queue is bound to.
            core::CallbackThread m_callbackThread; //!< The thread host tasks of this queue are submitted to.

        private:
            typename TApi::Stream_t m_UniformCudaHipQueue; //!< The native stream handle.
        };


        //! The CUDA/HIP RT queue.
        //!
        //! A handle around a shared QueueUniformCudaHipRtImpl. Two handles compare equal exactly when they refer to
        //! the same implementation object.
        //!
        //! \tparam TApi The runtime API wrapper.
        //! \tparam TBlocking Whether enqueuing a task additionally synchronizes the stream (see the Enqueue trait).
        template<typename TApi, bool TBlocking>
        class QueueUniformCudaHipRt
            : public interface::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRt<TApi, TBlocking>>
            , public interface::Implements<ConceptQueue, QueueUniformCudaHipRt<TApi, TBlocking>>
            , public interface::Implements<ConceptGetDev, QueueUniformCudaHipRt<TApi, TBlocking>>
        {
        public:
            //! Creates the queue implementation for the given device and registers it on that device.
            //!
            //! \param dev The device the queue is created on.
            ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt<TApi> const& dev)
                : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl<TApi>>(dev))
            {
                dev.registerQueue(m_spQueueImpl);
            }

            //! \return true if both handles share the same implementation object.
            ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const& rhs) const -> bool
            {
                return m_spQueueImpl == rhs.m_spQueueImpl;
            }

            //! \return true if the handles refer to different implementation objects.
            ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const& rhs) const -> bool
            {
                return m_spQueueImpl != rhs.m_spQueueImpl;
            }

            //! \return The native stream handle of the underlying implementation.
            [[nodiscard]] auto getNativeHandle() const noexcept
            {
                return m_spQueueImpl->getNativeHandle();
            }

            //! \return The CallbackThread owned by the underlying implementation.
            auto getCallbackThread() -> core::CallbackThread&
            {
                return m_spQueueImpl->m_callbackThread;
            }

            std::shared_ptr<QueueUniformCudaHipRtImpl<TApi>> m_spQueueImpl; //!< The shared queue implementation.
        };


    } // namespace uniform_cuda_hip::detail


    namespace trait

    {

        //! The CUDA/HIP RT queue device get trait specialization.
        template<typename TApi, bool TBlocking>
        struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            //! \param queue The queue whose device is requested.
            //! \return A copy of the device stored in the queue implementation.
            ALPAKA_FN_HOST static auto getDev(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
                -> DevUniformCudaHipRt<TApi>
            {
                auto const& impl = *queue.m_spQueueImpl;
                return impl.m_dev;
            }
        };


        //! The CUDA/HIP RT queue test trait specialization.
        template<typename TApi, bool TBlocking>
        struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            //! Queries the stream of the given queue without waiting for it.
            //!
            //! \param queue The queue to test.
            //! \return true if the stream query reported TApi::success, false otherwise.
            ALPAKA_FN_HOST static auto empty(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> bool
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Querying is allowed even for queues that do not belong to the current device.
                // TApi::errorNotReady is handed to the check as an error value to ignore; the raw result is kept in
                // ret so that it can be inspected afterwards.
                typename TApi::Error_t ret = TApi::success;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
                    ret = TApi::streamQuery(queue.getNativeHandle()),
                    TApi::errorNotReady);

                return ret == TApi::success;
            }
        };


        //! The CUDA/HIP RT queue thread wait trait specialization.
        //!
        //! The calling thread is blocked until the queue has finished processing all previously requested tasks
        //! (kernels, data copies, ...).
        template<typename TApi, bool TBlocking>
        struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            //! Synchronizes the calling thread with the stream of the given queue.
            //!
            //! \param queue The queue to wait for.
            ALPAKA_FN_HOST static auto currentThreadWaitFor(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> void
            {
                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                // Synchronizing is allowed even for queues that do not belong to the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };


        //! The CUDA/HIP RT queue device type trait specialization.
        //!
        //! Applies to both values of TBlocking.
        template<typename TApi, bool TBlocking>
        struct DevType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            using type = DevUniformCudaHipRt<TApi>; //!< The device type of the queue.
        };


        //! The CUDA/HIP RT queue event type trait specialization.
        //!
        //! Applies to both values of TBlocking.
        template<typename TApi, bool TBlocking>
        struct EventType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            using type = EventUniformCudaHipRt<TApi>; //!< The event type of the queue.
        };


        //! The CUDA/HIP RT queue enqueue trait specialization for generic host tasks.
        //!
        //! The task is copied into a heap allocated HostFuncData which is handed to the runtime together with
        //! uniformCudaHipRtHostFunc. When the runtime calls the host function, the task is submitted to the
        //! CallbackThread of the queue and the host function waits for its completion.
        //!
        //! \tparam TApi The runtime API wrapper.
        //! \tparam TBlocking If true, enqueue additionally synchronizes the stream before returning.
        //! \tparam TTask The type of the task. It has to be copy constructible and callable without arguments.
        template<typename TApi, bool TBlocking, typename TTask>
        struct Enqueue<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>, TTask>
        {
            using QueueImpl = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;

            //! The payload passed through the runtime to the host function.
            struct HostFuncData
            {
                // We don't need to keep the queue alive, because in its dtor it will synchronize with the CUDA/HIP
                // stream and wait until all host functions and the CallbackThread are done. It's actually an error to
                // copy the queue into the host function. Destroying it here would call CUDA/HIP APIs from the host
                // function. Passing it further to the Callback thread, would make the Callback thread hold a task
                // containing the queue with the CallbackThread itself. Destroying the task if no other queue instance
                // exists will make the CallbackThread join itself and crash.
                QueueImpl& q; //!< The queue implementation the task was enqueued into.
                TTask t; //!< A copy of the enqueued task.
            };

            //! The host function executed by the runtime. Takes ownership of the HostFuncData behind arg, forwards
            //! the task to the CallbackThread of the queue and blocks until the task has finished.
            //!
            //! \param arg Pointer to a HostFuncData allocated with new by enqueue.
            ALPAKA_FN_HOST static void uniformCudaHipRtHostFunc(void* arg)
            {
                // arg was created from a HostFuncData*, so a static_cast restores the original pointer.
                auto data = std::unique_ptr<HostFuncData>(static_cast<HostFuncData*>(arg));
                // Fetch the queue before the payload is moved into the closure.
                auto& queue = data->q;
                auto f = queue.m_callbackThread.submit([d = std::move(data)] { d->t(); });
                f.wait();
            }

            //! Enqueues the task into the stream of the given queue.
            //!
            //! \param queue The queue to enqueue into.
            //! \param task The task to execute; it is copied.
            ALPAKA_FN_HOST static auto enqueue(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                TTask const& task) -> void
            {
                // Keep the payload owned until the runtime has accepted the host function. If the check below
                // throws, the payload is released here instead of being leaked.
                // NOTE(review): this assumes that the runtime never invokes the host function when launchHostFunc
                // reports an error - confirm against the CUDA/HIP documentation of the used versions.
                auto data = std::unique_ptr<HostFuncData>(new HostFuncData{*queue.m_spQueueImpl, task});
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::launchHostFunc(queue.getNativeHandle(), uniformCudaHipRtHostFunc, data.get()));
                // The launch succeeded: uniformCudaHipRtHostFunc is now responsible for deleting the payload.
                static_cast<void>(data.release());

                if constexpr(TBlocking)
                {
                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
                }
            }
        };


        //! The CUDA/HIP RT queue native handle trait specialization.
        //!
        //! Applies to both values of TBlocking.
        template<typename TApi, bool TBlocking>
        struct NativeHandle<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
        {
            //! \param queue The queue whose native handle is requested.
            //! \return The result of queue.getNativeHandle().
            [[nodiscard]] static auto getNativeHandle(
                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
            {
                return queue.getNativeHandle();
            }
        };


    } // namespace trait

} // namespace alpaka


#endif

CallbackThread.hpp

Cuda.hpp

ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55

DependentFalseType.hpp

Hip.hpp

Interface.hpp

ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
Definition UniformCudaHip.hpp:112

ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd,...)
CUDA/HIP runtime error checking with log and exception, ignoring specific error values.
Definition UniformCudaHip.hpp:101

ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
Definition UniformCudaHip.hpp:105

alpaka::DevUniformCudaHipRt
The CUDA/HIP RT device handle.
Definition DevUniformCudaHipRt.hpp:62

alpaka::DevUniformCudaHipRt::registerQueue
ALPAKA_FN_HOST auto registerQueue(std::shared_ptr< IDeviceQueue > spQueue) const -> void
Registers the given queue on this device. NOTE: Every queue has to be registered for correct function...
Definition DevUniformCudaHipRt.hpp:95

alpaka::EventUniformCudaHipRt
The CUDA/HIP RT device event.
Definition EventUniformCudaHipRt.hpp:87

alpaka::core::CallbackThread
Definition CallbackThread.hpp:21

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl
The CUDA/HIP RT queue implementation.
Definition QueueUniformCudaHipRt.hpp:41

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::getNativeHandle
auto getNativeHandle() const noexcept
Definition QueueUniformCudaHipRt.hpp:78

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::operator=
auto operator=(QueueUniformCudaHipRtImpl &&) -> QueueUniformCudaHipRtImpl &=delete

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::QueueUniformCudaHipRtImpl
QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl &&)=default

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::QueueUniformCudaHipRtImpl
ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt< TApi > const &dev)
Definition QueueUniformCudaHipRt.hpp:43

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::m_callbackThread
core::CallbackThread m_callbackThread
Definition QueueUniformCudaHipRt.hpp:85

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::m_dev
DevUniformCudaHipRt< TApi > const m_dev
The device this queue is bound to.
Definition QueueUniformCudaHipRt.hpp:84

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl::~QueueUniformCudaHipRtImpl
ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl()
Definition QueueUniformCudaHipRt.hpp:67

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt
The CUDA/HIP RT queue.
Definition QueueUniformCudaHipRt.hpp:97

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt::getCallbackThread
auto getCallbackThread() -> core::CallbackThread &
Definition QueueUniformCudaHipRt.hpp:120

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt::QueueUniformCudaHipRt
ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt< TApi > const &dev)
Definition QueueUniformCudaHipRt.hpp:99

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt::operator!=
ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const &rhs) const -> bool
Definition QueueUniformCudaHipRt.hpp:110

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt::operator==
ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const &rhs) const -> bool
Definition QueueUniformCudaHipRt.hpp:105

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt::getNativeHandle
auto getNativeHandle() const noexcept
Definition QueueUniformCudaHipRt.hpp:115

alpaka::uniform_cuda_hip::detail::QueueUniformCudaHipRt::m_spQueueImpl
std::shared_ptr< QueueUniformCudaHipRtImpl< TApi > > m_spQueueImpl
Definition QueueUniformCudaHipRt.hpp:126

ALPAKA_FN_HOST
#define ALPAKA_FN_HOST
Definition Common.hpp:40

Traits.hpp

Traits.hpp

alpaka
The alpaka accelerator library.
Definition AccCpuOmp2Blocks.hpp:52

std
STL namespace.

Traits.hpp

alpaka::interface::Implements
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition Interface.hpp:15

alpaka::trait::CurrentThreadWaitFor< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > >::currentThreadWaitFor
static ALPAKA_FN_HOST auto currentThreadWaitFor(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> void
Definition QueueUniformCudaHipRt.hpp:169

alpaka::trait::CurrentThreadWaitFor
The thread wait trait.
Definition Traits.hpp:21

alpaka::trait::DevType
The device type trait.
Definition Traits.hpp:23

alpaka::trait::Empty< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > >::empty
static ALPAKA_FN_HOST auto empty(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> bool
Definition QueueUniformCudaHipRt.hpp:148

alpaka::trait::Empty
The queue empty trait.
Definition Traits.hpp:31

alpaka::trait::Enqueue< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking >, TTask >::enqueue
static ALPAKA_FN_HOST auto enqueue(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > &queue, TTask const &task) -> void
Definition QueueUniformCudaHipRt.hpp:219

alpaka::trait::Enqueue< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking >, TTask >::uniformCudaHipRtHostFunc
static ALPAKA_FN_HOST void uniformCudaHipRtHostFunc(void *arg)
Definition QueueUniformCudaHipRt.hpp:211

alpaka::trait::Enqueue< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking >, TTask >::HostFuncData::t
TTask t
Definition QueueUniformCudaHipRt.hpp:208

alpaka::trait::Enqueue< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking >, TTask >::HostFuncData::q
QueueImpl & q
Definition QueueUniformCudaHipRt.hpp:207

alpaka::trait::Enqueue
The queue enqueue trait.
Definition Traits.hpp:27

alpaka::trait::EventType
The event type trait.
Definition Traits.hpp:17

alpaka::trait::GetDev< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > >::getDev
static ALPAKA_FN_HOST auto getDev(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue) -> DevUniformCudaHipRt< TApi >
Definition QueueUniformCudaHipRt.hpp:136

alpaka::trait::GetDev
The device get trait.
Definition Traits.hpp:27

alpaka::trait::NativeHandle< uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > >::getNativeHandle
static auto getNativeHandle(uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, TBlocking > const &queue)
Definition QueueUniformCudaHipRt.hpp:236

alpaka::trait::NativeHandle
The native handle trait.
Definition Traits.hpp:17

Traits.hpp

Traits.hpp