alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
BufUniformCudaHipRt.hpp
Go to the documentation of this file.
1/* Copyright 2025 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
2 * Bernhard Manfred Gruber, Antonio Di Pilato
3 * SPDX-License-Identifier: MPL-2.0
4 */
5
6#pragma once
7
10#include "alpaka/core/Hip.hpp"
12#include "alpaka/dev/Traits.hpp"
17#include "alpaka/vec/Vec.hpp"
18
19#include <cstddef>
20#include <functional>
21#include <memory>
22#include <type_traits>
23
24#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
25
26namespace alpaka
27{
28 // Forward declarations.
29 struct ApiCudaRt;
30 struct ApiHipRt;
31
32 template<typename TElem, typename TDim, typename TIdx>
33 class BufCpu;
34
35 namespace detail
36 {
        //! Pitch-storage helper base for the buffer class below.
        //! The primary template (used for dimensionality 0 and 1) accepts the
        //! pitch argument and discards it, so low-dimensional buffers carry no
        //! extra member.
37 template<typename TDim, typename SFINAE = void>
        // NOTE(review): the "struct PitchHolder" declaration line (doxygen line 38)
        // is missing from this extraction — it was a hyperlink in the generated
        // HTML. Confirm against the upstream alpaka header before editing.
39 {
            // Pitch is irrelevant for 0D/1D buffers; accept and ignore it.
40 explicit PitchHolder(std::size_t)
41 {
42 }
43 };
44
        //! Specialization for TDim >= 2: keeps the row pitch (in bytes) of the
        //! device allocation, i.e. the padded length of one row as reported by
        //! the pitched/3D allocation functions used further below.
45 template<typename TDim>
46 struct PitchHolder<TDim, std::enable_if_t<TDim::value >= 2>>
47 {
48 std::size_t m_rowPitchInBytes;
49 };
50 } // namespace detail
51
52 //! The CUDA/HIP memory buffer.
    //! Owns device memory via a shared_ptr with a custom deleter; inherits the
    //! row pitch (for TDim >= 2) from detail::PitchHolder.
    // NOTE(review): the class declaration line(s) (doxygen 54-55, the
    // "class BufUniformCudaHipRt : public detail::PitchHolder<TDim>" head) are
    // missing from this extraction — confirm against the upstream header.
53 template<typename TApi, typename TElem, typename TDim, typename TIdx>
56 , internal::ViewAccessOps<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
57 {
58 static_assert(!std::is_const_v<TElem>, "The elem type of the buffer must not be const");
59 static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer must not be const!");
60
61 //! Constructor
        //! \param pMem        already-allocated device pointer this buffer takes ownership of
        //! \param deleter     callable invoked on pMem when the last reference is dropped
        //! \param extent      extent in elements; must match TDim/TIdx (checked below)
        //! \param pitchBytes  row pitch in bytes; stored only when TDim >= 2
        // NOTE(review): the constructor head lines (doxygen 63-64, including the
        // "dev" parameter) are missing from this extraction.
62 template<typename TExtent, typename Deleter>
65 TElem* const pMem,
66 Deleter deleter,
67 TExtent const& extent,
68 std::size_t pitchBytes)
69 : detail::PitchHolder<TDim>{pitchBytes}
70 , m_dev(dev)
            // NOTE(review): the m_extentElements initializer line (doxygen 71) is
            // missing from this extraction.
72 , m_spMem(pMem, std::move(deleter))
73 {
75
            // Compile-time interface checks: the extent's dimension and index
            // types must match the buffer's template parameters.
76 static_assert(
77 TDim::value == alpaka::Dim<TExtent>::value,
78 "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
79 "identical!");
80 static_assert(
81 std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
82 "The idx type of TExtent and the TIdx template parameter have to be identical!");
83 }
84
        // NOTE(review): member declarations at doxygen 85-86 (the device handle
        // m_dev and the extent vector m_extentElements, per the index at the
        // bottom of this page) are missing from this extraction.
        // Shared ownership of the device allocation; the deleter captured at
        // construction performs the API-specific free.
87 std::shared_ptr<TElem> m_spMem;
88 };
89
90 namespace trait
91 {
92 //! The BufUniformCudaHipRt device type trait specialization.
93 template<typename TApi, typename TElem, typename TDim, typename TIdx>
94 struct DevType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
95 {
            // NOTE(review): the "using type = DevUniformCudaHipRt<TApi>;" line
            // (doxygen 96) is missing from this extraction.
97 };
98
99 //! The BufUniformCudaHipRt device get trait specialization.
100 template<typename TApi, typename TElem, typename TDim, typename TIdx>
101 struct GetDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
102 {
            // NOTE(review): the getDev(buf) signature lines (doxygen 103-104) are
            // missing from this extraction; the body returns the stored handle.
105 {
106 return buf.m_dev;
107 }
108 };
109
110 //! The BufUniformCudaHipRt dimension getter trait specialization.
111 template<typename TApi, typename TElem, typename TDim, typename TIdx>
112 struct DimType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
113 {
114 using type = TDim;
115 };
116
117 //! The BufUniformCudaHipRt memory element type get trait specialization.
118 template<typename TApi, typename TElem, typename TDim, typename TIdx>
119 struct ElemType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
120 {
121 using type = TElem;
122 };
123
124 //! The BufUniformCudaHipRt extent get trait specialization.
125 template<typename TApi, typename TElem, typename TDim, typename TIdx>
126 struct GetExtents<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
127 {
            // NOTE(review): the operator() signature line (doxygen 128) is
            // missing from this extraction. Returns the buffer's extent vector.
129 {
130 return buffer.m_extentElements;
131 }
132 };
133
134 //! The BufUniformCudaHipRt native pointer get trait specialization.
135 template<typename TApi, typename TElem, typename TDim, typename TIdx>
136 struct GetPtrNative<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
137 {
            // const overload: raw device pointer out of the shared_ptr.
            // NOTE(review): the signature line (doxygen 138) is missing from this
            // extraction; only the trailing return type survived.
139 -> TElem const*
140 {
141 return buf.m_spMem.get();
142 }
143
            // Mutable overload (signature line, doxygen 144, missing as well).
145 {
146 return buf.m_spMem.get();
147 }
148 };
149
150 //! The BufUniformCudaHipRt pointer on device get trait specialization.
    //! A device buffer is only dereferenceable on the device that owns it, so
    //! both overloads validate the device before handing out the pointer.
151 template<typename TApi, typename TElem, typename TDim, typename TIdx>
152 struct GetPtrDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
153 {
            // const overload.
            // NOTE(review): the signature head lines (doxygen 154-155) are
            // missing from this extraction.
156 DevUniformCudaHipRt<TApi> const& dev) -> TElem const*
157 {
158 if(dev == getDev(buf))
159 {
160 return buf.m_spMem.get();
161 }
162 else
163 {
                    // Asking for the pointer on a different device is an error,
                    // not a silent nullptr.
164 throw std::runtime_error("The buffer is not accessible from the given device!");
165 }
166 }
167
            // Mutable overload (signature head lines, doxygen 168-169, missing).
170 DevUniformCudaHipRt<TApi> const& dev) -> TElem*
171 {
172 if(dev == getDev(buf))
173 {
174 return buf.m_spMem.get();
175 }
176 else
177 {
178 throw std::runtime_error("The buffer is not accessible from the given device!");
179 }
180 }
181 };
182
183 template<typename TApi, typename TElem, typename TDim, typename TIdx>
184 struct GetPitchesInBytes<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
185 {
            // Builds the per-dimension pitch vector from the stored row pitch:
            //   v[last]     = sizeof(TElem)           (element stride)
            //   v[last-1]   = m_rowPitchInBytes       (padded row length)
            //   v[i-1]      = extent[i] * v[i]        (outer pitches, i > 0)
            // NOTE(review): the operator() signature lines (doxygen 186-187) are
            // missing from this extraction.
188 {
189 Vec<TDim, TIdx> v{};
190 if constexpr(TDim::value > 0)
191 {
192 v.back() = sizeof(TElem);
193 if constexpr(TDim::value > 1)
194 {
                        // The device allocation may pad each row; use the pitch
                        // recorded at allocation time, not width * sizeof(TElem).
195 v[TDim::value - 2] = static_cast<TIdx>(buf.m_rowPitchInBytes);
196 for(TIdx i = TDim::value - 2; i > 0; i--)
197 v[i - 1] = buf.m_extentElements[i] * v[i];
198 }
199 }
200 return v;
201 }
202 };
203
204 //! The CUDA/HIP memory allocation trait specialization.
    //! Selects the runtime allocation call by dimensionality at compile time:
    //! 0D/1D -> linear malloc, 2D -> pitched, 3D -> malloc3D.
205 template<typename TApi, typename TElem, typename Dim, typename TIdx>
206 struct BufAlloc<TElem, Dim, TIdx, DevUniformCudaHipRt<TApi>>
207 {
208 template<typename TExtent>
209 ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt<TApi> const& dev, TExtent const& extent)
            // NOTE(review): the trailing return type (doxygen 210, returning the
            // buffer type) and the log-scope macro line (doxygen 212-213) are
            // missing from this extraction.
211 {
213
                // Allocation is device-scoped: select the target device first.
214 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
215
216 void* memPtr = nullptr;
217 std::size_t rowPitchInBytes = 0u;
                // Zero-extent buffers allocate nothing; memPtr stays nullptr.
218 if(getExtentProduct(extent) != 0)
219 {
220 if constexpr(Dim::value == 0)
221 {
                        // 0D: a single element.
222 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc(&memPtr, sizeof(TElem)));
223 }
224 else if constexpr(Dim::value == 1)
225 {
                        // 1D: contiguous linear allocation, no padding.
                        // NOTE(review): the macro-call opener line (doxygen 226,
                        // "ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(") is missing from
                        // this extraction.
227 TApi::malloc(&memPtr, static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem)));
228 }
229 else if constexpr(Dim::value == 2)
230 {
                        // 2D: pitched allocation; the runtime reports the padded
                        // row length in rowPitchInBytes.
231 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocPitch(
232 &memPtr,
233 &rowPitchInBytes,
234 static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
235 static_cast<std::size_t>(getHeight(extent))));
236 }
237 else if constexpr(Dim::value == 3)
238 {
                        // 3D: malloc3D returns a pitched pointer descriptor;
                        // unpack pointer and row pitch from it.
239 typename TApi::Extent_t const extentVal = TApi::makeExtent(
240 static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
241 static_cast<std::size_t>(getHeight(extent)),
242 static_cast<std::size_t>(getDepth(extent)));
243 typename TApi::PitchedPtr_t pitchedPtrVal;
244 pitchedPtrVal.ptr = nullptr;
245 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc3D(&pitchedPtrVal, extentVal));
246 memPtr = pitchedPtrVal.ptr;
247 rowPitchInBytes = pitchedPtrVal.pitch;
248 }
249 }
                // Full-debug builds trace extents, pointer and pitch.
250# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
251 std::cout << __func__;
252 if constexpr(Dim::value >= 1)
253 std::cout << " ew: " << getWidth(extent);
254 if constexpr(Dim::value >= 2)
255 std::cout << " eh: " << getHeight(extent);
256 if constexpr(Dim::value >= 3)
257 std::cout << " ed: " << getDepth(extent);
258 std::cout << " ptr: " << memPtr;
259 if constexpr(Dim::value >= 2)
260 std::cout << " rowpitch: " << rowPitchInBytes;
261 std::cout << std::endl;
262# endif
                // The buffer takes ownership; the deleter frees via the API and
                // must not throw (NOEXCEPT check variant).
263 return {
264 dev,
265 reinterpret_cast<TElem*>(memPtr),
266 [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::free(ptr)); },
267 extent,
268 rowPitchInBytes};
269 }
270 };
271
272 //! The CUDA/HIP stream-ordered memory allocation trait specialization.
    //! Allocates with the stream-ordered allocator (mallocAsync) on the queue's
    //! device; the matching freeAsync is enqueued on the same queue, which the
    //! deleter keeps alive by capturing it.
273 template<typename TApi, typename TElem, typename TDim, typename TIdx>
274 struct AsyncBufAlloc<TElem, TDim, TIdx, DevUniformCudaHipRt<TApi>>
275 {
276 static_assert(
277 TDim::value <= 1,
278 "CUDA/HIP devices support only one-dimensional stream-ordered memory buffers.");
279
280 template<typename TQueue, typename TExtent>
281 ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, [[maybe_unused]] TExtent const& extent)
            // NOTE(review): the trailing return type (doxygen 282) and the
            // log-scope macro line (doxygen 284) are missing from this extraction.
283 {
285
286 static_assert(TDim::value == Dim<TExtent>::value, "extent must have the same dimension as the buffer");
287 auto const width = getExtentProduct(extent); // handles 1D and 0D buffers
288
289 auto const& dev = getDev(queue);
290 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
291 void* memPtr = nullptr;
292 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocAsync(
293 &memPtr,
294 static_cast<std::size_t>(width) * sizeof(TElem),
295 queue.getNativeHandle()));
296
297# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
298 std::cout << __func__ << " ew: " << width << " ptr: " << memPtr << std::endl;
299# endif
                // The deleter copies the queue (q) so the native queue handle
                // outlives the buffer and freeAsync targets the right stream.
300 return {
301 dev,
302 reinterpret_cast<TElem*>(memPtr),
303 [q = std::move(queue)](TElem* ptr)
304 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, q.getNativeHandle())); },
305 extent,
                // 1D buffers are unpadded: pitch == width * sizeof(TElem).
306 static_cast<std::size_t>(width) * sizeof(TElem)};
307 }
308 };
309
310 //! The CUDA/HIP stream-ordered memory allocation capability trait specialization.
        //! True exactly for the dimensionalities AsyncBufAlloc supports (<= 1).
311 template<typename TApi, typename TDim>
312 struct HasAsyncBufSupport<TDim, DevUniformCudaHipRt<TApi>> : std::bool_constant<TDim::value <= 1>
313 {
314 };
315
316 //! The pinned/mapped memory allocation trait specialization for the CUDA/HIP devices.
    //! Allocates page-locked host memory (mapped + portable) and wraps it in a
    //! BufCpu whose deleter releases it through the device API's hostFree.
317 template<typename TApi, typename TElem, typename TDim, typename TIdx>
318 struct BufAllocMapped<PlatformUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
319 {
320 template<typename TExtent>
321 ALPAKA_FN_HOST static auto allocMappedBuf(
322 DevCpu const& host,
323 PlatformUniformCudaHipRt<TApi> const& /*platform*/,
324 TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
325 {
                // NOTE(review): the log-scope macro line (doxygen 326) is missing
                // from this extraction.
327
328 // Allocate CUDA/HIP page-locked memory on the host, mapped into the CUDA/HIP address space and
329 // accessible to all CUDA/HIP devices.
330 TElem* memPtr = nullptr;
                // NOTE(review): the macro-call opener line (doxygen 331,
                // "ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostMalloc(") is
                // missing from this extraction — see the call reproduced in the
                // member index at the bottom of this page.
332 reinterpret_cast<void**>(&memPtr),
333 sizeof(TElem) * static_cast<std::size_t>(getExtentProduct(extent)),
334 TApi::hostMallocMapped | TApi::hostMallocPortable));
                // Host memory must be released with the API's hostFree, not free().
335 auto deleter = [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::hostFree(ptr)); };
336
337 return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
338 }
339 };
340
341 //! The pinned/mapped memory allocation capability trait specialization.
        //! The CUDA/HIP platform always supports mapped host allocations.
342 template<typename TApi>
343 struct HasMappedBufSupport<PlatformUniformCudaHipRt<TApi>> : public std::true_type
344 {
345 };
346
347 //! The BufUniformCudaHipRt offset get trait specialization.
348 template<typename TApi, typename TElem, typename TDim, typename TIdx>
349 struct GetOffsets<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
350 {
            // NOTE(review): the whole operator() (doxygen 351-355) is missing
            // from this extraction; per the member index at the bottom of this
            // page it takes the buffer and returns a Vec<TDim, TIdx> —
            // presumably all zeros, since a buffer is never offset. Confirm
            // against the upstream header.
356 };
357
358 //! The BufUniformCudaHipRt idx type trait specialization.
359 template<typename TApi, typename TElem, typename TDim, typename TIdx>
360 struct IdxType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
361 {
362 using type = TIdx;
363 };
364
365 //! The BufCpu pointer on CUDA/HIP device get trait specialization.
    //! Translates a mapped host buffer's pointer into the corresponding device
    //! address via hostGetDevicePointer.
366 template<typename TApi, typename TElem, typename TDim, typename TIdx>
367 struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
368 {
            // const overload.
            // NOTE(review): the signature opener (doxygen 369,
            // "ALPAKA_FN_HOST static auto getPtrDev(") is missing from this
            // extraction.
370 BufCpu<TElem, TDim, TIdx> const& buf,
371 DevUniformCudaHipRt<TApi> const&) -> TElem const*
372 {
373 // TODO: Check if the memory is mapped at all!
374 TElem* pDev(nullptr);
375
                // const_cast is needed because the runtime API takes void*; the
                // pointer is returned as TElem const* so constness is preserved
                // for the caller.
376 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(
377 &pDev,
378 const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
379 0));
380
381 return pDev;
382 }
383
            // Mutable overload (signature opener, doxygen 384, missing as well).
385 -> TElem*
386 {
387 // TODO: Check if the memory is mapped at all!
388 TElem* pDev(nullptr);
389
390 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(&pDev, getPtrNative(buf), 0));
391
392 return pDev;
393 }
394 };
395 } // namespace trait
396} // namespace alpaka
397
400
401#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CPU memory buffer.
Definition BufCpu.hpp:90
The CUDA/HIP RT device handle.
A n-dimensional vector.
Definition Vec.hpp:38
ALPAKA_FN_HOST_ACC constexpr auto back() -> TVal &
Definition Vec.hpp:141
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtentProduct(T const &object) -> Idx< T >
Definition Traits.hpp:134
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getHeight(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:108
ALPAKA_FN_HOST auto getPtrNative(TView const &view) -> Elem< TView > const *
Gets the native pointer of the memory view.
Definition Traits.hpp:136
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T > >
Definition Traits.hpp:59
ALPAKA_FN_HOST auto getDev(T const &t)
Definition Traits.hpp:68
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getDepth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:121
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getWidth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:95
STL namespace.
The CUDA/HIP memory buffer.
DevUniformCudaHipRt< TApi > m_dev
ALPAKA_FN_HOST BufUniformCudaHipRt(DevUniformCudaHipRt< TApi > const &dev, TElem *const pMem, Deleter deleter, TExtent const &extent, std::size_t pitchBytes)
Constructor.
std::shared_ptr< TElem > m_spMem
static ALPAKA_FN_HOST auto allocAsyncBuf(TQueue queue, TExtent const &extent) -> BufUniformCudaHipRt< TApi, TElem, TDim, TIdx >
The stream-ordered memory allocator trait.
Definition Traits.hpp:31
static ALPAKA_FN_HOST auto allocBuf(DevUniformCudaHipRt< TApi > const &dev, TExtent const &extent) -> BufUniformCudaHipRt< TApi, TElem, Dim, TIdx >
The memory allocator trait.
Definition Traits.hpp:27
The device type trait.
Definition Traits.hpp:23
The dimension getter type trait.
Definition Traits.hpp:14
The element type trait.
Definition Traits.hpp:16
static ALPAKA_FN_HOST auto getDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition Traits.hpp:27
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buffer) const
The GetExtents trait for getting the extents of an object as an alpaka::Vec.
Definition Traits.hpp:37
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &) const -> Vec< TDim, TIdx >
The GetOffsets trait for getting the offsets of an object as an alpaka::Vec.
Definition Traits.hpp:33
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) const -> Vec< TDim, TIdx >
Customization point for getPitchesInBytes. The default implementation uses the extent to calculate th...
Definition Traits.hpp:103
static ALPAKA_FN_HOST auto getPtrDev(BufCpu< TElem, TDim, TIdx > const &buf, DevUniformCudaHipRt< TApi > const &) -> TElem const *
static ALPAKA_FN_HOST auto getPtrDev(BufCpu< TElem, TDim, TIdx > &buf, DevUniformCudaHipRt< TApi > const &) -> TElem *
static ALPAKA_FN_HOST auto getPtrDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf, DevUniformCudaHipRt< TApi > const &dev) -> TElem const *
static ALPAKA_FN_HOST auto getPtrDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &buf, DevUniformCudaHipRt< TApi > const &dev) -> TElem *
The pointer on device get trait.
Definition Traits.hpp:58
static ALPAKA_FN_HOST auto getPtrNative(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &buf) -> TElem *
static ALPAKA_FN_HOST auto getPtrNative(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> TElem const *
The native pointer get trait.
Definition Traits.hpp:54
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostMalloc(reinterpret_cast< void ** >(&memPtr), sizeof(TElem) *static_cast< std::size_t >(getExtentProduct(extent)), TApi::hostMallocMapped|TApi::hostMallocPortable))
The stream-ordered memory allocation capability trait.
Definition Traits.hpp:36
The pinned/mapped memory allocation capability trait.
Definition Traits.hpp:46
The idx type trait.
Definition Traits.hpp:25