alpaka
Abstraction Library for Parallel Kernel Acceleration
BufUniformCudaHipRt.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
2  * Bernhard Manfred Gruber, Antonio Di Pilato
3  * SPDX-License-Identifier: MPL-2.0
4  */
5 
6 #pragma once
7 
8 #include "alpaka/core/Assert.hpp"
9 #include "alpaka/core/Cuda.hpp"
10 #include "alpaka/core/Hip.hpp"
12 #include "alpaka/dev/Traits.hpp"
17 #include "alpaka/vec/Vec.hpp"
18 
19 #include <cstddef>
20 #include <functional>
21 #include <memory>
22 #include <type_traits>
23 
24 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
25 
26 namespace alpaka
27 {
28  // Forward declarations.
29  struct ApiCudaRt;
30  struct ApiHipRt;
31 
32  template<typename TElem, typename TDim, typename TIdx>
33  class BufCpu;
34 
namespace detail
{
    //! Storage for the row pitch of a pitched device allocation.
    //!
    //! The unspecialized variant is an empty base used for 0D/1D buffers,
    //! which have no pitch: its constructor simply discards the value, so
    //! the empty-base optimization removes any size overhead.
    template<typename TDim, typename SFINAE = void>
    struct PitchHolder
    {
        explicit PitchHolder(std::size_t)
        {
        }
    };

    //! For buffers of two or more dimensions the pitch of one row in bytes
    //! is kept, as reported by the pitched CUDA/HIP allocation routines.
    template<typename TDim>
    struct PitchHolder<TDim, std::enable_if_t<TDim::value >= 2>>
    {
        std::size_t m_rowPitchInBytes;
    };
} // namespace detail
51 
52  //! The CUDA/HIP memory buffer.
53  template<typename TApi, typename TElem, typename TDim, typename TIdx>
55  : detail::PitchHolder<TDim>
56  , internal::ViewAccessOps<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
57  {
58  static_assert(!std::is_const_v<TElem>, "The elem type of the buffer must not be const");
59  static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer must not be const!");
60 
61  //! Constructor
62  template<typename TExtent, typename Deleter>
64  DevUniformCudaHipRt<TApi> const& dev,
65  TElem* const pMem,
66  Deleter deleter,
67  TExtent const& extent,
68  std::size_t pitchBytes)
69  : detail::PitchHolder<TDim>{pitchBytes}
70  , m_dev(dev)
71  , m_extentElements(getExtents(extent))
72  , m_spMem(pMem, std::move(deleter))
73  {
75 
76  static_assert(
77  TDim::value == alpaka::Dim<TExtent>::value,
78  "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
79  "identical!");
80  static_assert(
81  std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
82  "The idx type of TExtent and the TIdx template parameter have to be identical!");
83  }
84 
87  std::shared_ptr<TElem> m_spMem;
88  };
89 
90  namespace trait
91  {
92  //! The BufUniformCudaHipRt device type trait specialization.
93  template<typename TApi, typename TElem, typename TDim, typename TIdx>
94  struct DevType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
95  {
97  };
98 
99  //! The BufUniformCudaHipRt device get trait specialization.
100  template<typename TApi, typename TElem, typename TDim, typename TIdx>
101  struct GetDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
102  {
105  {
106  return buf.m_dev;
107  }
108  };
109 
110  //! The BufUniformCudaHipRt dimension getter trait specialization.
111  template<typename TApi, typename TElem, typename TDim, typename TIdx>
112  struct DimType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
113  {
114  using type = TDim;
115  };
116 
117  //! The BufUniformCudaHipRt memory element type get trait specialization.
118  template<typename TApi, typename TElem, typename TDim, typename TIdx>
119  struct ElemType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
120  {
121  using type = TElem;
122  };
123 
124  //! The BufUniformCudaHipRt extent get trait specialization.
125  template<typename TApi, typename TElem, typename TDim, typename TIdx>
126  struct GetExtents<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
127  {
129  {
130  return buffer.m_extentElements;
131  }
132  };
133 
134  //! The BufUniformCudaHipRt native pointer get trait specialization.
135  template<typename TApi, typename TElem, typename TDim, typename TIdx>
136  struct GetPtrNative<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
137  {
139  -> TElem const*
140  {
141  return buf.m_spMem.get();
142  }
143 
145  {
146  return buf.m_spMem.get();
147  }
148  };
149 
150  //! The BufUniformCudaHipRt pointer on device get trait specialization.
151  template<typename TApi, typename TElem, typename TDim, typename TIdx>
152  struct GetPtrDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
153  {
156  DevUniformCudaHipRt<TApi> const& dev) -> TElem const*
157  {
158  if(dev == getDev(buf))
159  {
160  return buf.m_spMem.get();
161  }
162  else
163  {
164  throw std::runtime_error("The buffer is not accessible from the given device!");
165  }
166  }
167 
170  DevUniformCudaHipRt<TApi> const& dev) -> TElem*
171  {
172  if(dev == getDev(buf))
173  {
174  return buf.m_spMem.get();
175  }
176  else
177  {
178  throw std::runtime_error("The buffer is not accessible from the given device!");
179  }
180  }
181  };
182 
183  template<typename TApi, typename TElem, typename TDim, typename TIdx>
184  struct GetPitchesInBytes<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
185  {
187  -> Vec<TDim, TIdx>
188  {
189  Vec<TDim, TIdx> v{};
190  if constexpr(TDim::value > 0)
191  {
192  v.back() = sizeof(TElem);
193  if constexpr(TDim::value > 1)
194  {
195  v[TDim::value - 2] = static_cast<TIdx>(buf.m_rowPitchInBytes);
196  for(TIdx i = TDim::value - 2; i > 0; i--)
197  v[i - 1] = buf.m_extentElements[i] * v[i];
198  }
199  }
200  return v;
201  }
202  };
203 
204  //! The CUDA/HIP memory allocation trait specialization.
205  template<typename TApi, typename TElem, typename Dim, typename TIdx>
206  struct BufAlloc<TElem, Dim, TIdx, DevUniformCudaHipRt<TApi>>
207  {
208  template<typename TExtent>
209  ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt<TApi> const& dev, TExtent const& extent)
211  {
213 
214  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
215 
216  void* memPtr = nullptr;
217  std::size_t rowPitchInBytes = 0u;
218  if(getExtentProduct(extent) != 0)
219  {
220  if constexpr(Dim::value == 0)
221  {
222  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc(&memPtr, sizeof(TElem)));
223  }
224  else if constexpr(Dim::value == 1)
225  {
227  TApi::malloc(&memPtr, static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem)));
228  }
229  else if constexpr(Dim::value == 2)
230  {
231  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocPitch(
232  &memPtr,
233  &rowPitchInBytes,
234  static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
235  static_cast<std::size_t>(getHeight(extent))));
236  }
237  else if constexpr(Dim::value == 3)
238  {
239  typename TApi::Extent_t const extentVal = TApi::makeExtent(
240  static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
241  static_cast<std::size_t>(getHeight(extent)),
242  static_cast<std::size_t>(getDepth(extent)));
243  typename TApi::PitchedPtr_t pitchedPtrVal;
244  pitchedPtrVal.ptr = nullptr;
245  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc3D(&pitchedPtrVal, extentVal));
246  memPtr = pitchedPtrVal.ptr;
247  rowPitchInBytes = pitchedPtrVal.pitch;
248  }
249  }
250 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
251  std::cout << __func__;
252  if constexpr(Dim::value >= 1)
253  std::cout << " ew: " << getWidth(extent);
254  if constexpr(Dim::value >= 2)
255  std::cout << " eh: " << getHeight(extent);
256  if constexpr(Dim::value >= 3)
257  std::cout << " ed: " << getDepth(extent);
258  std::cout << " ptr: " << memPtr;
259  if constexpr(Dim::value >= 2)
260  std::cout << " rowpitch: " << rowPitchInBytes;
261  std::cout << std::endl;
262 # endif
263  return {
264  dev,
265  reinterpret_cast<TElem*>(memPtr),
266  [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::free(ptr)); },
267  extent,
268  rowPitchInBytes};
269  }
270  };
271 
272  //! The CUDA/HIP stream-ordered memory allocation trait specialization.
273  template<typename TApi, typename TElem, typename TDim, typename TIdx>
274  struct AsyncBufAlloc<TElem, TDim, TIdx, DevUniformCudaHipRt<TApi>>
275  {
276 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
277  static_assert(
278  std::is_same_v<TApi, ApiCudaRt> && TApi::version >= BOOST_VERSION_NUMBER(11, 2, 0),
279  "Support for stream-ordered memory buffers requires CUDA 11.2 or higher.");
280 # endif
281 # if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
282  static_assert(
283  std::is_same_v<TApi, ApiHipRt> && TApi::version >= BOOST_VERSION_NUMBER(5, 3, 0),
284  "Support for stream-ordered memory buffers requires HIP/ROCm 5.3 or higher.");
285 # endif
286  static_assert(
287  TDim::value <= 1,
288  "CUDA/HIP devices support only one-dimensional stream-ordered memory buffers.");
289 
290  template<typename TQueue, typename TExtent>
291  ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, [[maybe_unused]] TExtent const& extent)
293  {
295 
296  static_assert(TDim::value == Dim<TExtent>::value, "extent must have the same dimension as the buffer");
297  auto const width = getExtentProduct(extent); // handles 1D and 0D buffers
298 
299  auto const& dev = getDev(queue);
300  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
301  void* memPtr = nullptr;
302  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocAsync(
303  &memPtr,
304  static_cast<std::size_t>(width) * sizeof(TElem),
305  queue.getNativeHandle()));
306 
307 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
308  std::cout << __func__ << " ew: " << width << " ptr: " << memPtr << std::endl;
309 # endif
310  return {
311  dev,
312  reinterpret_cast<TElem*>(memPtr),
313  [q = std::move(queue)](TElem* ptr)
314  { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, q.getNativeHandle())); },
315  extent,
316  static_cast<std::size_t>(width) * sizeof(TElem)};
317  }
318  };
319 
320  //! The CUDA/HIP stream-ordered memory allocation capability trait specialization.
321  template<typename TApi, typename TDim>
323  : std::bool_constant<
324  TDim::value <= 1
325  && (
326 # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
327  std::is_same_v<TApi, ApiCudaRt> && TApi::version >= BOOST_VERSION_NUMBER(11, 2, 0)
328 # elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
329  std::is_same_v<TApi, ApiHipRt> && TApi::version >= BOOST_VERSION_NUMBER(5, 3, 0)
330 # else
331  false
332 # endif
333  )>
334  {
335  };
336 
337  //! The pinned/mapped memory allocation trait specialization for the CUDA/HIP devices.
338  template<typename TApi, typename TElem, typename TDim, typename TIdx>
339  struct BufAllocMapped<PlatformUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
340  {
341  template<typename TExtent>
342  ALPAKA_FN_HOST static auto allocMappedBuf(
343  DevCpu const& host,
344  PlatformUniformCudaHipRt<TApi> const& /*platform*/,
345  TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
346  {
348 
349  // Allocate CUDA/HIP page-locked memory on the host, mapped into the CUDA/HIP address space and
350  // accessible to all CUDA/HIP devices.
351  TElem* memPtr = nullptr;
353  reinterpret_cast<void**>(&memPtr),
354  sizeof(TElem) * static_cast<std::size_t>(getExtentProduct(extent)),
355  TApi::hostMallocMapped | TApi::hostMallocPortable));
356  auto deleter = [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::hostFree(ptr)); };
357 
358  return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
359  }
360  };
361 
362  //! The pinned/mapped memory allocation capability trait specialization.
363  template<typename TApi>
364  struct HasMappedBufSupport<PlatformUniformCudaHipRt<TApi>> : public std::true_type
365  {
366  };
367 
368  //! The BufUniformCudaHipRt offset get trait specialization.
369  template<typename TApi, typename TElem, typename TDim, typename TIdx>
370  struct GetOffsets<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
371  {
373  -> Vec<TDim, TIdx>
374  {
375  return Vec<TDim, TIdx>::zeros();
376  }
377  };
378 
379  //! The BufUniformCudaHipRt idx type trait specialization.
380  template<typename TApi, typename TElem, typename TDim, typename TIdx>
381  struct IdxType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
382  {
383  using type = TIdx;
384  };
385 
386  //! The BufCpu pointer on CUDA/HIP device get trait specialization.
387  template<typename TApi, typename TElem, typename TDim, typename TIdx>
388  struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
389  {
391  BufCpu<TElem, TDim, TIdx> const& buf,
392  DevUniformCudaHipRt<TApi> const&) -> TElem const*
393  {
394  // TODO: Check if the memory is mapped at all!
395  TElem* pDev(nullptr);
396 
397  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(
398  &pDev,
399  const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
400  0));
401 
402  return pDev;
403  }
404 
406  -> TElem*
407  {
408  // TODO: Check if the memory is mapped at all!
409  TElem* pDev(nullptr);
410 
411  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(&pDev, getPtrNative(buf), 0));
412 
413  return pDev;
414  }
415  };
416  } // namespace trait
417 } // namespace alpaka
418 
421 
422 #endif
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd)
CUDA/HIP runtime error checking with log.
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CPU memory buffer.
Definition: BufCpu.hpp:90
The CPU device handle.
Definition: DevCpu.hpp:56
The CUDA/HIP RT device handle.
ALPAKA_NO_HOST_ACC_WARNING static constexpr ALPAKA_FN_HOST_ACC auto zeros() -> Vec< TDim, TVal >
Zero value constructor.
Definition: Vec.hpp:126
constexpr ALPAKA_FN_HOST_ACC auto back() -> TVal &
Definition: Vec.hpp:168
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition: Traits.hpp:29
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtentProduct(T const &object) -> Idx< T >
Definition: Traits.hpp:134
ALPAKA_FN_HOST auto free(TAlloc const &alloc, T const *const ptr) -> void
Frees the memory identified by the given pointer.
Definition: Traits.hpp:41
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getHeight(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition: Traits.hpp:108
ALPAKA_FN_HOST auto malloc(TAlloc const &alloc, std::size_t const &sizeElems) -> T *
Definition: Traits.hpp:33
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtents(T const &object) -> Vec< Dim< T >, Idx< T >>
Definition: Traits.hpp:59
ALPAKA_FN_HOST auto allocMappedBuf(DevCpu const &host, TPlatform const &platform, TExtent const &extent=TExtent())
Allocates pinned/mapped host memory, accessible by all devices in the given platform.
Definition: Traits.hpp:138
ALPAKA_FN_HOST auto getPtrNative(TView const &view) -> Elem< TView > const *
Gets the native pointer of the memory view.
Definition: Traits.hpp:136
ALPAKA_FN_HOST auto getDev(T const &t)
Definition: Traits.hpp:68
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getDepth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition: Traits.hpp:121
typename trait::DimType< T >::type Dim
The dimension type trait alias template to remove the ::type.
Definition: Traits.hpp:19
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getWidth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition: Traits.hpp:95
The CUDA/HIP memory buffer.
DevUniformCudaHipRt< TApi > m_dev
ALPAKA_FN_HOST BufUniformCudaHipRt(DevUniformCudaHipRt< TApi > const &dev, TElem *const pMem, Deleter deleter, TExtent const &extent, std::size_t pitchBytes)
Constructor.
std::shared_ptr< TElem > m_spMem
static ALPAKA_FN_HOST auto allocAsyncBuf(TQueue queue, [[maybe_unused]] TExtent const &extent) -> BufUniformCudaHipRt< TApi, TElem, TDim, TIdx >
The stream-ordered memory allocator trait.
Definition: Traits.hpp:31
The pinned/mapped memory allocator trait.
Definition: Traits.hpp:41
static ALPAKA_FN_HOST auto allocBuf(DevUniformCudaHipRt< TApi > const &dev, TExtent const &extent) -> BufUniformCudaHipRt< TApi, TElem, Dim, TIdx >
The memory allocator trait.
Definition: Traits.hpp:27
The device type trait.
Definition: Traits.hpp:23
The dimension getter type trait.
Definition: Traits.hpp:14
The element type trait.
Definition: Traits.hpp:16
static ALPAKA_FN_HOST auto getDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition: Traits.hpp:27
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buffer) const
The GetExtents trait for getting the extents of an object as an alpaka::Vec.
Definition: Traits.hpp:37
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &) const -> Vec< TDim, TIdx >
The GetOffsets trait for getting the offsets of an object as an alpaka::Vec.
Definition: Traits.hpp:33
ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) const -> Vec< TDim, TIdx >
Customization point for getPitchesInBytes. The default implementation uses the extent to calculate th...
Definition: Traits.hpp:103
static ALPAKA_FN_HOST auto getPtrDev(BufCpu< TElem, TDim, TIdx > const &buf, DevUniformCudaHipRt< TApi > const &) -> TElem const *
static ALPAKA_FN_HOST auto getPtrDev(BufCpu< TElem, TDim, TIdx > &buf, DevUniformCudaHipRt< TApi > const &) -> TElem *
static ALPAKA_FN_HOST auto getPtrDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf, DevUniformCudaHipRt< TApi > const &dev) -> TElem const *
static ALPAKA_FN_HOST auto getPtrDev(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &buf, DevUniformCudaHipRt< TApi > const &dev) -> TElem *
The pointer on device get trait.
Definition: Traits.hpp:58
static ALPAKA_FN_HOST auto getPtrNative(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > &buf) -> TElem *
static ALPAKA_FN_HOST auto getPtrNative(BufUniformCudaHipRt< TApi, TElem, TDim, TIdx > const &buf) -> TElem const *
The native pointer get trait.
Definition: Traits.hpp:54
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostMalloc(reinterpret_cast< void ** >(&memPtr), sizeof(TElem) *static_cast< std::size_t >(getExtentProduct(extent)), TApi::hostMallocMapped|TApi::hostMallocPortable))
The stream-ordered memory allocation capability trait.
Definition: Traits.hpp:36
The pinned/mapped memory allocation capability trait.
Definition: Traits.hpp:46
The idx type trait.
Definition: Traits.hpp:25