alpaka - Abstraction Library for Parallel Kernel Acceleration
Copy.hpp
/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
 * Bernhard Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

#include "alpaka/core/Assert.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/dev/DevCpu.hpp"
#include "alpaka/dev/DevUniformCudaHipRt.hpp"
#include "alpaka/dim/DimIntegralConst.hpp"
#include "alpaka/extent/Traits.hpp"
#include "alpaka/mem/view/Traits.hpp"
#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"

#include <cstddef>
#include <cstdint>
#include <set>
#include <tuple>
#include <type_traits>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    namespace detail
    {
        //! The CUDA/HIP memory copy trait.
        template<typename TApi, typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
        struct TaskCopyUniformCudaHip;

        //! The scalar CUDA/HIP memory copy trait.
        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
        struct TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>
        {
            using Idx = alpaka::Idx<TExtent>;

            template<typename TViewDstFwd>
            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                [[maybe_unused]] TExtent const& extent,
                typename TApi::MemcpyKind_t const& uniformMemCpyKind,
                int const& iDstDevice,
                int const& iSrcDevice)
                : m_uniformMemCpyKind(uniformMemCpyKind)
                , m_iDstDevice(iDstDevice)
                , m_iSrcDevice(iSrcDevice)
                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                ALPAKA_ASSERT(getExtentProduct(extent) == 1);
#    endif
            }

            template<typename TQueue>
            auto enqueue(TQueue& queue) const -> void
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printDebug();
#    endif
                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
                // see https://github.com/fwyzard/nvidia_bug_3446335 .
                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
                // Initiate the memory copy.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
                    m_dstMemNative,
                    m_srcMemNative,
                    sizeof(Elem<TViewDst>),
                    m_uniformMemCpyKind,
                    queue.getNativeHandle()));
            }

        private:
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            ALPAKA_FN_HOST auto printDebug() const -> void
            {
                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << Idx(1u)
                          << " ewb: " << static_cast<Idx>(sizeof(Elem<TViewDst>)) << " dw: " << Idx(1u)
                          << " dptr: " << m_dstMemNative << " sdev: " << m_iSrcDevice << " sw: " << Idx(1u)
                          << " sptr: " << m_srcMemNative << std::endl;
            }
#    endif

            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
            int m_iDstDevice;
            int m_iSrcDevice;
            void* m_dstMemNative;
            void const* m_srcMemNative;
        };

        //! The 1D CUDA/HIP memory copy trait.
        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
        struct TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>
        {
            using Idx = alpaka::Idx<TExtent>;

            template<typename TViewDstFwd>
            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                TExtent const& extent,
                typename TApi::MemcpyKind_t const& uniformMemCpyKind,
                int const& iDstDevice,
                int const& iSrcDevice)
                : m_uniformMemCpyKind(uniformMemCpyKind)
                , m_iDstDevice(iDstDevice)
                , m_iSrcDevice(iSrcDevice)
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                , m_extentWidth(getWidth(extent))
                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
#    endif
                , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
#    endif
            }

            template<typename TQueue>
            auto enqueue(TQueue& queue) const -> void
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printDebug();
#    endif
                if(m_extentWidthBytes == std::size_t{0})
                {
                    return;
                }

                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
                // see https://github.com/fwyzard/nvidia_bug_3446335 .
                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
                // Initiate the memory copy.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
                    m_dstMemNative,
                    m_srcMemNative,
                    m_extentWidthBytes,
                    m_uniformMemCpyKind,
                    queue.getNativeHandle()));
            }

        private:
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            ALPAKA_FN_HOST auto printDebug() const -> void
            {
                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extentWidth
                          << " ewb: " << m_extentWidthBytes << " dw: " << m_dstWidth << " dptr: " << m_dstMemNative
                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sptr: " << m_srcMemNative
                          << std::endl;
            }
#    endif

            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
            int m_iDstDevice;
            int m_iSrcDevice;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            Idx m_extentWidth;
            Idx m_dstWidth;
            Idx m_srcWidth;
#    endif
            std::size_t m_extentWidthBytes;
            void* m_dstMemNative;
            void const* m_srcMemNative;
        };

        //! The 2D CUDA/HIP memory copy trait.
        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
        struct TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>
        {
            using Idx = alpaka::Idx<TExtent>;

            template<typename TViewDstFwd>
            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                TExtent const& extent,
                typename TApi::MemcpyKind_t const& uniformMemcpyKind,
                int const& iDstDevice,
                int const& iSrcDevice)
                : m_uniformMemCpyKind(uniformMemcpyKind)
                , m_iDstDevice(iDstDevice)
                , m_iSrcDevice(iSrcDevice)
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                , m_extentWidth(getWidth(extent))
#    endif
                , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
                , m_extentHeight(getHeight(extent))
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
                , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
#    endif
                , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
                , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
#    endif
            }

            template<typename TQueue>
            auto enqueue(TQueue& queue) const -> void
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printDebug();
#    endif
                // This is not only an optimization but also prevents a division by zero.
                if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0)
                {
                    return;
                }

                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
                // see https://github.com/fwyzard/nvidia_bug_3446335 .
                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
                // Initiate the memory copy.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpy2DAsync(
                    m_dstMemNative,
                    m_dstRowPitchBytes,
                    m_srcMemNative,
                    m_srcRowPitchBytes,
                    m_extentWidthBytes,
                    static_cast<std::size_t>(m_extentHeight),
                    m_uniformMemCpyKind,
                    queue.getNativeHandle()));
            }

        private:
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            ALPAKA_FN_HOST auto printDebug() const -> void
            {
                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
                          << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth
                          << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitch: " << m_dstRowPitchBytes
                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight
                          << " sptr: " << m_srcMemNative << " spitch: " << m_srcRowPitchBytes << std::endl;
            }
#    endif

            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
            int m_iDstDevice;
            int m_iSrcDevice;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            Idx m_extentWidth;
#    endif
            std::size_t m_extentWidthBytes;
            Idx m_dstWidth;
            Idx m_srcWidth;

            Idx m_extentHeight;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            Idx m_dstHeight;
            Idx m_srcHeight;
#    endif
            std::size_t m_dstRowPitchBytes;
            std::size_t m_srcRowPitchBytes;

            void* m_dstMemNative;
            void const* m_srcMemNative;
        };

        //! The 3D CUDA/HIP memory copy trait.
        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
        struct TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>
        {
            using Idx = alpaka::Idx<TExtent>;

            template<typename TViewDstFwd>
            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                TExtent const& extent,
                typename TApi::MemcpyKind_t const& uniformMemcpyKind,
                int const& iDstDevice,
                int const& iSrcDevice)
                : m_uniformMemCpyKind(uniformMemcpyKind)
                , m_iDstDevice(iDstDevice)
                , m_iSrcDevice(iSrcDevice)
                , m_extentWidth(getWidth(extent))
                , m_extentWidthBytes(static_cast<std::size_t>(m_extentWidth) * sizeof(Elem<TViewDst>))
                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
                , m_extentHeight(getHeight(extent))
                , m_extentDepth(getDepth(extent))
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
                , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
                , m_dstDepth(static_cast<Idx>(getDepth(viewDst)))
                , m_srcDepth(static_cast<Idx>(getDepth(viewSrc)))
#    endif
                , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[1]))
                , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[1]))
                , m_dstSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
                , m_srcSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
                ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
                ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
#    endif
            }

            template<typename TQueue>
            auto enqueue(TQueue& queue) const -> void
            {
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                printDebug();
#    endif
                // This is not only an optimization but also prevents a division by zero.
                if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0 || m_extentDepth == 0)
                {
                    return;
                }

                // Create the struct describing the copy.
                typename TApi::Memcpy3DParms_t const uniformCudaHipMemCpy3DParms(buildUniformCudaHipMemcpy3DParms());

                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
                // see https://github.com/fwyzard/nvidia_bug_3446335 .
                // Set the current device.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
                // Initiate the memory copy.
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
                    TApi::memcpy3DAsync(&uniformCudaHipMemCpy3DParms, queue.getNativeHandle()));
            }

        private:
            ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms() const -> typename TApi::Memcpy3DParms_t
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                // Fill CUDA/HIP parameter structure.
                typename TApi::Memcpy3DParms_t memCpy3DParms{}; // zero-init required per CUDA documentation
                // The pitched pointers take the allocation's logical width in elements and its height in rows;
                // dividing the slice pitch by the row pitch recovers the number of rows per slice.
                memCpy3DParms.srcPtr = TApi::makePitchedPtr(
                    const_cast<void*>(m_srcMemNative),
                    m_srcRowPitchBytes,
                    static_cast<std::size_t>(m_srcWidth),
                    m_srcSlicePitchBytes / m_srcRowPitchBytes);
                memCpy3DParms.dstPtr = TApi::makePitchedPtr(
                    m_dstMemNative,
                    m_dstRowPitchBytes,
                    static_cast<std::size_t>(m_dstWidth),
                    m_dstSlicePitchBytes / m_dstRowPitchBytes);
                memCpy3DParms.extent = TApi::makeExtent(
                    m_extentWidthBytes,
                    static_cast<std::size_t>(m_extentHeight),
                    static_cast<std::size_t>(m_extentDepth));
                memCpy3DParms.kind = m_uniformMemCpyKind;
                return memCpy3DParms;
            }

#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            ALPAKA_FN_HOST auto printDebug() const -> void
            {
                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
                          << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice
                          << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth
                          << " dptr: " << m_dstMemNative << " drowpitch: " << m_dstRowPitchBytes
                          << " dslicepitch: " << m_dstSlicePitchBytes << " sdev: " << m_iSrcDevice
                          << " sw: " << m_srcWidth << " sh: " << m_srcHeight << " sd: " << m_srcDepth
                          << " sptr: " << m_srcMemNative << " srowpitch: " << m_srcRowPitchBytes
                          << " sslicepitch: " << m_srcSlicePitchBytes << std::endl;
            }
#    endif
            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
            int m_iDstDevice;
            int m_iSrcDevice;

            Idx m_extentWidth;
            std::size_t m_extentWidthBytes;
            Idx m_dstWidth;
            Idx m_srcWidth;

            Idx m_extentHeight;
            Idx m_extentDepth;
#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
            Idx m_dstHeight;
            Idx m_srcHeight;
            Idx m_dstDepth;
            Idx m_srcDepth;
#    endif
            std::size_t m_dstRowPitchBytes;
            std::size_t m_srcRowPitchBytes;
            std::size_t m_dstSlicePitchBytes;
            std::size_t m_srcSlicePitchBytes;

            void* m_dstMemNative;
            void const* m_srcMemNative;
        };
    } // namespace detail

    // Trait specializations for CreateTaskMemcpy.
    namespace trait
    {
        //! The CUDA/HIP to CPU memory copy trait specialization.
        template<typename TApi, typename TDim>
        struct CreateTaskMemcpy<TDim, DevCpu, DevUniformCudaHipRt<TApi>>
        {
            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
            ALPAKA_FN_HOST static auto createTaskMemcpy(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                TExtent const& extent) -> alpaka::detail::
                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                auto const iDevice = getDev(viewSrc).getNativeHandle();

                return {
                    std::forward<TViewDstFwd>(viewDst),
                    viewSrc,
                    extent,
                    TApi::memcpyDeviceToHost,
                    iDevice,
                    iDevice};
            }
        };

        //! The CPU to CUDA/HIP memory copy trait specialization.
        template<typename TApi, typename TDim>
        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevCpu>
        {
            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
            ALPAKA_FN_HOST static auto createTaskMemcpy(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                TExtent const& extent) -> alpaka::detail::
                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                auto const iDevice = getDev(viewDst).getNativeHandle();

                return {
                    std::forward<TViewDstFwd>(viewDst),
                    viewSrc,
                    extent,
                    TApi::memcpyHostToDevice,
                    iDevice,
                    iDevice};
            }
        };

        //! The CUDA/HIP to CUDA/HIP memory copy trait specialization.
        template<typename TApi, typename TDim>
        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevUniformCudaHipRt<TApi>>
        {
            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
            ALPAKA_FN_HOST static auto createTaskMemcpy(
                TViewDstFwd&& viewDst,
                TViewSrc const& viewSrc,
                TExtent const& extent) -> alpaka::detail::
                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                auto const iDstDevice = getDev(viewDst).getNativeHandle();

                return {
                    std::forward<TViewDstFwd>(viewDst),
                    viewSrc,
                    extent,
                    TApi::memcpyDeviceToDevice,
                    iDstDevice,
                    getDev(viewSrc).getNativeHandle()};
            }
        };

        //! The CUDA/HIP non-blocking device queue scalar copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtNonBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);
            }
        };

        //! The CUDA/HIP blocking device queue scalar copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);

                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };

        //! The CUDA/HIP non-blocking device queue 1D copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtNonBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);
            }
        };

        //! The CUDA/HIP blocking device queue 1D copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);

                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };

        //! The CUDA/HIP non-blocking device queue 2D copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtNonBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);
            }
        };

        //! The CUDA/HIP blocking device queue 2D copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);

                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };

        //! The CUDA/HIP non-blocking device queue 3D copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtNonBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);
            }
        };

        //! The CUDA/HIP blocking device queue 3D copy enqueue trait specialization.
        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
        struct Enqueue<
            QueueUniformCudaHipRtBlocking<TApi>,
            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
        {
            ALPAKA_FN_HOST static auto enqueue(
                QueueUniformCudaHipRtBlocking<TApi>& queue,
                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
                -> void
            {
                ALPAKA_DEBUG_FULL_LOG_SCOPE;

                task.enqueue(queue);

                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
            }
        };
    } // namespace trait
} // namespace alpaka

#endif
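
Usage sketch (not part of Copy.hpp): the task and enqueue specializations above are normally reached through the public alpaka API rather than instantiated directly. The following minimal example is a sketch assuming the alpaka 1.x API (PlatformCpu, PlatformCudaRt, getDevByIdx, allocBuf, memcpy, wait) and a build with ALPAKA_ACC_GPU_CUDA_ENABLED; the buffer names and the extent are illustrative only.

#include <alpaka/alpaka.hpp>

#include <cstddef>

int main()
{
    using Idx = std::size_t;
    using Dim = alpaka::DimInt<1u>;
    using Vec = alpaka::Vec<Dim, Idx>;

    // Select the host device and the first CUDA device.
    auto const platformHost = alpaka::PlatformCpu{};
    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
    auto const platformAcc = alpaka::PlatformCudaRt{};
    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

    // A non-blocking CUDA queue: copies enqueued here dispatch to the
    // Enqueue<QueueUniformCudaHipRtNonBlocking<...>, TaskCopyUniformCudaHip<...>> specializations.
    auto queue = alpaka::QueueCudaRtNonBlocking{devAcc};

    auto const extent = Vec{Idx{1024}};
    auto bufHost = alpaka::allocBuf<float, Idx>(devHost, extent);
    auto bufDev = alpaka::allocBuf<float, Idx>(devAcc, extent);

    // Host -> device: CreateTaskMemcpy<Dim, DevUniformCudaHipRt, DevCpu> builds a 1D task
    // that ends up in TApi::memcpyAsync with memcpyHostToDevice.
    alpaka::memcpy(queue, bufDev, bufHost);

    // Device -> host: CreateTaskMemcpy<Dim, DevCpu, DevUniformCudaHipRt> uses memcpyDeviceToHost.
    alpaka::memcpy(queue, bufHost, bufDev);

    // The non-blocking queue returns immediately; wait before touching the host data.
    alpaka::wait(queue);
    return 0;
}

With a blocking queue (for example alpaka::QueueCudaRtBlocking) the blocking Enqueue specializations are selected instead, which additionally call TApi::streamSynchronize after submitting the copy.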