alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
BufUniformCudaHipRtTraits.hpp
Go to the documentation of this file.
1/* Copyright 2025 Anton Reinhard
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11
12#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
13
14namespace alpaka::trait
15{
16 //! The CUDA/HIP RT device memory buffer type trait specialization.
17 template<typename TApi, typename TElem, typename TDim, typename TIdx>
18 struct BufType<DevUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
19 {
21 };
22
23 //! The BufUniformCudaHipRt device type trait specialization.
24 template<typename TApi, typename TElem, typename TDim, typename TIdx>
25 struct DevType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
26 {
28 };
29
30 //! The BufUniformCudaHipRt device get trait specialization.
31 template<typename TApi, typename TElem, typename TDim, typename TIdx>
32 struct GetDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
33 {
36 {
37 return buf.m_spBufImpl->m_dev;
38 }
39 };
40
41 //! The BufUniformCudaHipRt dimension getter trait.
42 template<typename TApi, typename TElem, typename TDim, typename TIdx>
43 struct DimType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
44 {
45 using type = TDim;
46 };
47
48 //! The BufUniformCudaHipRt memory element type get trait specialization.
49 template<typename TApi, typename TElem, typename TDim, typename TIdx>
50 struct ElemType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
51 {
52 using type = TElem;
53 };
54
55 //! The BufUniformCudaHipRt width get trait specialization.
56 template<typename TApi, typename TElem, typename TDim, typename TIdx>
57 struct GetExtents<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
58 {
60 {
61 return buf.m_spBufImpl->m_extentElements;
62 }
63 };
64
65 //! The BufUniformCudaHipRt native pointer get trait specialization.
66 template<typename TApi, typename TElem, typename TDim, typename TIdx>
67 struct GetPtrNative<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
68 {
70 -> TElem const*
71 {
72 return buf.m_spBufImpl->m_pMem;
73 }
74
76 {
77 return buf.m_spBufImpl->m_pMem;
78 }
79 };
80
81 //! The BufUniformCudaHipRt pointer on device get trait specialization.
82 template<typename TApi, typename TElem, typename TDim, typename TIdx>
83 struct GetPtrDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
84 {
87 DevUniformCudaHipRt<TApi> const& dev) -> TElem const*
88 {
89 if(dev == getDev(buf))
90 {
91 return buf.m_spBufImpl->m_pMem;
92 }
93 else
94 {
95 throw std::runtime_error("The buffer is not accessible from the given device!");
96 }
97 }
98
101 DevUniformCudaHipRt<TApi> const& dev) -> TElem*
102 {
103 if(dev == getDev(buf))
104 {
105 return buf.m_spBufImpl->m_pMem;
106 }
107 else
108 {
109 throw std::runtime_error("The buffer is not accessible from the given device!");
110 }
111 }
112 };
113
114 template<typename TApi, typename TElem, typename TDim, typename TIdx>
124
125 //! The BufUniformCudaHipRt offset get trait specialization.
126 template<typename TApi, typename TElem, typename TDim, typename TIdx>
127 struct GetOffsets<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
128 {
134 };
135
136 //! The BufUniformCudaHipRt idx type trait specialization.
137 template<typename TApi, typename TElem, typename TDim, typename TIdx>
138 struct IdxType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
139 {
140 using type = TIdx;
141 };
142
143 //! The BufCpu pointer on CUDA/HIP device get trait specialization.
144 template<typename TApi, typename TElem, typename TDim, typename TIdx>
145 struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
146 {
148 -> TElem const*
149 {
150 // TODO: Check if the memory is mapped at all!
151 TElem* pDev(nullptr);
152
153 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(
154 &pDev,
155 const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
156 0));
157
158 return pDev;
159 }
160
162 -> TElem*
163 {
164 // TODO: Check if the memory is mapped at all!
165 TElem* pDev(nullptr);
166
167 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(&pDev, getPtrNative(buf), 0));
168
169 return pDev;
170 }
171 };
172
173 //! The MakeConstBuf trait for CUDA/HIP buffers.
174 template<typename TApi, typename TElem, typename TDim, typename TIdx>
189
190 //! The CUDA/HIP memory allocation trait specialization.
191 template<typename TApi, typename TElem, typename Dim, typename TIdx>
192 struct BufAlloc<TElem, Dim, TIdx, DevUniformCudaHipRt<TApi>>
193 {
194 template<typename TExtent>
195 ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt<TApi> const& dev, TExtent const& extent)
197 {
199
200 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
201
202 void* memPtr = nullptr;
203 std::size_t rowPitchInBytes = 0u;
204 if(getExtentProduct(extent) != 0)
205 {
206 if constexpr(Dim::value == 0)
207 {
208 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc(&memPtr, sizeof(TElem)));
209 }
210 else if constexpr(Dim::value == 1)
211 {
213 TApi::malloc(&memPtr, static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem)));
214 }
215 else if constexpr(Dim::value == 2)
216 {
217 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocPitch(
218 &memPtr,
219 &rowPitchInBytes,
220 static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
221 static_cast<std::size_t>(getHeight(extent))));
222 }
223 else if constexpr(Dim::value == 3)
224 {
225 typename TApi::Extent_t const extentVal = TApi::makeExtent(
226 static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
227 static_cast<std::size_t>(getHeight(extent)),
228 static_cast<std::size_t>(getDepth(extent)));
229 typename TApi::PitchedPtr_t pitchedPtrVal;
230 pitchedPtrVal.ptr = nullptr;
231 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc3D(&pitchedPtrVal, extentVal));
232 memPtr = pitchedPtrVal.ptr;
233 rowPitchInBytes = pitchedPtrVal.pitch;
234 }
235 }
236# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
237 std::cout << __func__;
238 if constexpr(Dim::value >= 1)
239 std::cout << " ew: " << getWidth(extent);
240 if constexpr(Dim::value >= 2)
241 std::cout << " eh: " << getHeight(extent);
242 if constexpr(Dim::value >= 3)
243 std::cout << " ed: " << getDepth(extent);
244 std::cout << " ptr: " << memPtr;
245 if constexpr(Dim::value >= 2)
246 std::cout << " rowpitch: " << rowPitchInBytes;
247 std::cout << std::endl;
248# endif
249 return {
250 dev,
251 reinterpret_cast<TElem*>(memPtr),
252 [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::free(ptr)); },
253 extent,
254 rowPitchInBytes};
255 }
256 };
257
258 //! The CUDA/HIP stream-ordered memory allocation capability trait specialization.
259 template<typename TApi, typename TDim>
260 struct HasAsyncBufSupport<TDim, DevUniformCudaHipRt<TApi>> : std::true_type
261 {
262 };
263
264 //! The CUDA/HIP stream-ordered memory allocation trait specialization.
265 template<typename TApi, typename TElem, typename TDim, typename TIdx>
266 struct AsyncBufAlloc<TElem, TDim, TIdx, DevUniformCudaHipRt<TApi>>
267 {
268 template<typename TQueue>
269 ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, [[maybe_unused]] Vec<TDim, TIdx> const& extent)
271 {
273
274 std::size_t bytes, pitch;
275 if constexpr(TDim::value == 0)
276 {
277 bytes = pitch = sizeof(TElem);
278 }
279 else if constexpr(TDim::value == 1)
280 {
281 bytes = pitch = static_cast<std::size_t>(extent.back()) * sizeof(TElem);
282 }
283 else
284 {
285 std::size_t const width = static_cast<std::size_t>(extent.back()) * sizeof(TElem);
286 // On all tested NVIDIA and AMD GPUs the alignment used for pitched allocations is the same value
287 // reported by the textureAlignment device property (512 bytes on NVIDA GPUs, 256 bytes on AMD GPUs).
288 // This was tested on: NVIDIA Tesla T4, A100, L40S, H100, and RTX 3050 Ti Laptop GPUs,
289 // and on AMD Radeon Pro WX 9100, Radeon Pro W7800/W7900, Instinct MI250X, and Instinct MI300X.
290 // However, it is expected that an alignment of 128 bytes (32 threads per warp times 4 bytes per float
291 // or int) should be sufficient to achieve coalesced memory accesses, and would reduce the amount of
292 // wasted memory.
293 constexpr std::size_t alignment = 128;
294 pitch = (width + alignment - 1) / alignment * alignment;
295 // Replace the last entry in the extent vector (i.e. the number of elements per row) with the pitch
296 // (the number of bytes per row, including padding), and compute the total size in bytes, removing
297 // the padding after the last row.
298 auto aligned = alpaka::castVec<std::size_t>(extent);
299 aligned.back() = pitch;
300 bytes = aligned.prod() - pitch + width;
301 }
302
303 auto const& dev = getDev(queue);
304 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
305 void* memPtr = nullptr;
306 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocAsync(&memPtr, bytes, queue.getNativeHandle()));
307
308# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
309 std::cout << __func__;
310 if constexpr(Dim::value >= 1)
311 std::cout << " ew: " << getWidth(extent);
312 if constexpr(Dim::value >= 2)
313 std::cout << " eh: " << getHeight(extent);
314 if constexpr(Dim::value >= 3)
315 std::cout << " ed: " << getDepth(extent);
316 std::cout << " ptr: " << memPtr;
317 if constexpr(Dim::value >= 2)
318 std::cout << " rowpitch: " << pitch;
319 std::cout << std::endl;
320# endif
321 return {
322 dev,
323 reinterpret_cast<TElem*>(memPtr),
324 [q = std::move(queue)](TElem* ptr)
325 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, q.getNativeHandle())); },
326 extent,
327 pitch};
328 }
329 };
330
331 //! The pinned/mapped memory allocation capability trait specialization.
332 template<typename TApi>
333 struct HasMappedBufSupport<PlatformUniformCudaHipRt<TApi>> : public std::true_type
334 {
335 };
336
337 //! The pinned/mapped memory allocation trait specialization for the CUDA/HIP devices.
338 template<typename TApi, typename TElem, typename TDim, typename TIdx>
339 struct BufAllocMapped<PlatformUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
340 {
341 template<typename TExtent>
343 DevCpu const& host,
344 PlatformUniformCudaHipRt<TApi> const& /*platform*/,
345 TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
346 {
348
349 // Allocate CUDA/HIP page-locked memory on the host, mapped into the CUDA/HIP address space and
350 // accessible to all CUDA/HIP devices.
351 TElem* memPtr = nullptr;
352 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostMalloc(
353 reinterpret_cast<void**>(&memPtr),
354 sizeof(TElem) * static_cast<std::size_t>(getExtentProduct(extent)),
355 TApi::hostMallocMapped | TApi::hostMallocPortable));
356 auto deleter = [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::hostFree(ptr)); };
357
358 return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
359 }
360 };
361
362} // namespace alpaka::trait
363
364#endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CPU memory buffer template implementing mutating accessors.
Definition BufCpu.hpp:24
The generic memory buffer template implementing mutating accessors.
std::shared_ptr< TBufImpl > m_spBufImpl
The CPU device handle.
Definition DevCpu.hpp:56
The CUDA/HIP RT device handle.
A n-dimensional vector.
Definition Vec.hpp:38
ALPAKA_NO_HOST_ACC_WARNING static ALPAKA_FN_HOST_ACC constexpr auto zeros() -> Vec< TDim, TVal >
Zero value constructor.
Definition Vec.hpp:99
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The accelerator traits.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtentProduct(T const &object) -> Idx< T >
Definition Traits.hpp:134
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getHeight(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:108
ALPAKA_FN_HOST auto getPtrNative(TView const &view) -> Elem< TView > const *
Gets the native pointer of the memory view.
Definition Traits.hpp:165
ALPAKA_FN_HOST auto getDev(T const &t)
Definition Traits.hpp:68
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getDepth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:121
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getWidth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:95
static ALPAKA_FN_HOST auto allocAsyncBuf(TQueue queue, Vec< TDim, TIdx > const &extent) -> BufUniformCudaHipRt< TApi, TElem, TDim, TIdx >
The stream-ordered memory allocator trait.
Definition Traits.hpp:35
static ALPAKA_FN_HOST auto allocMappedBuf(DevCpu const &host, PlatformUniformCudaHipRt< TApi > const &, TExtent const &extent) -> BufCpu< TElem, TDim, TIdx >
The pinned/mapped memory allocator trait.
Definition Traits.hpp:45
static ALPAKA_FN_HOST auto allocBuf(DevUniformCudaHipRt< TApi > const &dev, TExtent const &extent) -> BufUniformCudaHipRt< TApi, TElem, Dim, TIdx >
The memory allocator trait.
Definition Traits.hpp:31
The memory buffer type trait.
Definition Traits.hpp:23
The device type trait.
Definition Traits.hpp:23
The dimension getter type trait.
Definition Traits.hpp:14
The element type trait.
Definition Traits.hpp:16
static ALPAKA_FN_HOST auto getDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition Traits.hpp:27
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf)
The GetExtents trait for getting the extents of an object as an alpaka::Vec.
Definition Traits.hpp:37
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &) const -> Vec< TDim, TIdx >
The GetOffsets trait for getting the offsets of an object as an alpaka::Vec.
Definition Traits.hpp:33
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) const -> Vec< TDim, TIdx >
Customization point for getPitchesInBytes. The default implementation uses the extent to calculate th...
Definition Traits.hpp:129
static ALPAKA_FN_HOST auto getPtrDev(BufCpu< TElem, TDim, TIdx > const &buf, DevUniformCudaHipRt< TApi > const &) -> TElem const *
static ALPAKA_FN_HOST auto getPtrDev(BufCpu< TElem, TDim, TIdx > &buf, DevUniformCudaHipRt< TApi > const &) -> TElem *
static ALPAKA_FN_HOST auto getPtrDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf, DevUniformCudaHipRt< TApi > const &dev) -> TElem const *
static ALPAKA_FN_HOST auto getPtrDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &buf, DevUniformCudaHipRt< TApi > const &dev) -> TElem *
The pointer on device get trait.
Definition Traits.hpp:84
static ALPAKA_FN_HOST auto getPtrNative(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &buf) -> TElem *
static ALPAKA_FN_HOST auto getPtrNative(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> TElem const *
The native pointer get trait.
Definition Traits.hpp:80
The stream-ordered memory allocation capability trait.
Definition Traits.hpp:40
The pinned/mapped memory allocation capability trait.
Definition Traits.hpp:50
The idx type trait.
Definition Traits.hpp:25
static ALPAKA_FN_HOST auto makeConstBuf(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &&buf) -> ConstBufUniformCudaHipRt< TApi, TElem, TDim, TIdx >
static ALPAKA_FN_HOST auto makeConstBuf(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> ConstBufUniformCudaHipRt< TApi, TElem, TDim, TIdx >
The trait to transform a mutable buffer into a constant one.
Definition Traits.hpp:55