alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Copy.hpp
Go to the documentation of this file.
/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
 * Bernhard Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once
#include "alpaka/core/Assert.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/dev/DevCpu.hpp"
#include "alpaka/dev/DevUniformCudaHipRt.hpp"
#include "alpaka/dim/DimIntegralConst.hpp"
#include "alpaka/extent/Traits.hpp"
#include "alpaka/mem/view/Traits.hpp"
#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <utility>
25#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
26
27namespace alpaka
28{
29 namespace detail
30 {
31 //! The CUDA/HIP memory copy trait.
32 template<typename TApi, typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
34
35 //! The scalar CUDA/HIP memory copy trait.
36 template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
37 struct TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>
38 {
40
41 template<typename TViewDstFwd>
43 TViewDstFwd&& viewDst,
44 TViewSrc const& viewSrc,
45 [[maybe_unused]] TExtent const& extent,
46 typename TApi::MemcpyKind_t const& uniformMemCpyKind,
47 int const& iDstDevice,
48 int const& iSrcDevice)
49 : m_uniformMemCpyKind(uniformMemCpyKind)
50 , m_iDstDevice(iDstDevice)
51 , m_iSrcDevice(iSrcDevice)
52 , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
53 , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
54 {
55# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
56 ALPAKA_ASSERT(getExtentProduct(extent) == 1);
57# endif
58 }
59
60 template<typename TQueue>
61 auto enqueue(TQueue& queue) const -> void
62 {
63# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
64 printDebug();
65# endif
66 // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
67 // see https://github.com/fwyzard/nvidia_bug_3446335 .
68 // Set the current device.
69 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
70 // Initiate the memory copy.
71 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
72 m_dstMemNative,
73 m_srcMemNative,
74 sizeof(Elem<TViewDst>),
75 m_uniformMemCpyKind,
76 queue.getNativeHandle()));
77 }
78
79 private:
80# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
81 ALPAKA_FN_HOST auto printDebug() const -> void
82 {
83 std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << Idx(1u)
84 << " ewb: " << static_cast<Idx>(sizeof(Elem<TViewDst>)) << " dw: " << Idx(1u)
85 << " dptr: " << m_dstMemNative << " sdev: " << m_iSrcDevice << " sw: " << Idx(1u)
86 << " sptr: " << m_srcMemNative << std::endl;
87 }
88# endif
89
90 typename TApi::MemcpyKind_t m_uniformMemCpyKind;
91 int m_iDstDevice;
92 int m_iSrcDevice;
93 void* m_dstMemNative;
94 void const* m_srcMemNative;
95 };
96
97 //! The 1D CUDA/HIP memory copy trait.
98 template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
99 struct TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>
100 {
102
103 template<typename TViewDstFwd>
105 TViewDstFwd&& viewDst,
106 TViewSrc const& viewSrc,
107 TExtent const& extent,
108 typename TApi::MemcpyKind_t const& uniformMemCpyKind,
109 int const& iDstDevice,
110 int const& iSrcDevice)
111 : m_uniformMemCpyKind(uniformMemCpyKind)
112 , m_iDstDevice(iDstDevice)
113 , m_iSrcDevice(iSrcDevice)
115 , m_extentWidth(getWidth(extent))
116 , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
117 , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
118# endif
119 , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
120 , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
121 , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
122 {
123# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
124 ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
125 ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
126# endif
127 }
128
129 template<typename TQueue>
130 auto enqueue(TQueue& queue) const -> void
131 {
132# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
133 printDebug();
134# endif
135 if(m_extentWidthBytes == std::size_t{0})
136 {
137 return;
138 }
139
140 // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
141 // see https://github.com/fwyzard/nvidia_bug_3446335 .
142 // Set the current device.
143 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
144 // Initiate the memory copy.
145 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
146 m_dstMemNative,
147 m_srcMemNative,
148 m_extentWidthBytes,
149 m_uniformMemCpyKind,
150 queue.getNativeHandle()));
151 }
152
153 private:
154# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
155 ALPAKA_FN_HOST auto printDebug() const -> void
156 {
157 std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extentWidth
158 << " ewb: " << m_extentWidthBytes << " dw: " << m_dstWidth << " dptr: " << m_dstMemNative
159 << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sptr: " << m_srcMemNative
160 << std::endl;
161 }
162# endif
163
164 typename TApi::MemcpyKind_t m_uniformMemCpyKind;
165 int m_iDstDevice;
166 int m_iSrcDevice;
167# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
168 Idx m_extentWidth;
169 Idx m_dstWidth;
170 Idx m_srcWidth;
171# endif
172 std::size_t m_extentWidthBytes;
173 void* m_dstMemNative;
174 void const* m_srcMemNative;
175 };
176
177 //! The 2D CUDA/HIP memory copy trait.
178 template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
179 struct TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>
180 {
182
183 template<typename TViewDstFwd>
185 TViewDstFwd&& viewDst,
186 TViewSrc const& viewSrc,
187 TExtent const& extent,
188 typename TApi::MemcpyKind_t const& uniformMemcpyKind,
189 int const& iDstDevice,
190 int const& iSrcDevice)
191 : m_uniformMemCpyKind(uniformMemcpyKind)
192 , m_iDstDevice(iDstDevice)
193 , m_iSrcDevice(iSrcDevice)
195 , m_extentWidth(getWidth(extent))
196# endif
197 , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
198 , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
199 , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
200 , m_extentHeight(getHeight(extent))
202 , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
203 , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
204# endif
205 , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
206 , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
207 , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
208 , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
209 {
210# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
211 ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
212 ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
213 ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
214 ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
215 ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
216 ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
217# endif
218 }
219
220 template<typename TQueue>
221 auto enqueue(TQueue& queue) const -> void
222 {
223# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
224 printDebug();
225# endif
226 // This is not only an optimization but also prevents a division by zero.
227 if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0)
228 {
229 return;
230 }
231
232 // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
233 // see https://github.com/fwyzard/nvidia_bug_3446335 .
234 // Set the current device.
235 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
236 // Initiate the memory copy.
237 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpy2DAsync(
238 m_dstMemNative,
239 m_dstRowPitchBytes,
240 m_srcMemNative,
241 m_srcRowPitchBytes,
242 m_extentWidthBytes,
243 static_cast<std::size_t>(m_extentHeight),
244 m_uniformMemCpyKind,
245 queue.getNativeHandle()));
246 }
247
248 private:
249# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
250 ALPAKA_FN_HOST auto printDebug() const -> void
251 {
252 std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
253 << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth
254 << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitch: " << m_dstRowPitchBytes
255 << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight
256 << " sptr: " << m_srcMemNative << " spitch: " << m_srcRowPitchBytes << std::endl;
257 }
258# endif
259
260 typename TApi::MemcpyKind_t m_uniformMemCpyKind;
261 int m_iDstDevice;
262 int m_iSrcDevice;
263# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
264 Idx m_extentWidth;
265# endif
266 std::size_t m_extentWidthBytes;
267 Idx m_dstWidth;
268 Idx m_srcWidth;
269
270 Idx m_extentHeight;
271# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
272 Idx m_dstHeight;
273 Idx m_srcHeight;
274# endif
275 std::size_t m_dstRowPitchBytes;
276 std::size_t m_srcRowPitchBytes;
277
278 void* m_dstMemNative;
279 void const* m_srcMemNative;
280 };
281
282 //! The 3D CUDA/HIP memory copy trait.
283 template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
284 struct TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>
285 {
287
288 template<typename TViewDstFwd>
290 TViewDstFwd&& viewDst,
291 TViewSrc const& viewSrc,
292 TExtent const& extent,
293 typename TApi::MemcpyKind_t const& uniformMemcpyKind,
294 int const& iDstDevice,
295 int const& iSrcDevice)
296 : m_uniformMemCpyKind(uniformMemcpyKind)
297 , m_iDstDevice(iDstDevice)
298 , m_iSrcDevice(iSrcDevice)
299 , m_extentWidth(getWidth(extent))
300 , m_extentWidthBytes(static_cast<std::size_t>(m_extentWidth) * sizeof(Elem<TViewDst>))
301 , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
302 , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
303 , m_extentHeight(getHeight(extent))
304 , m_extentDepth(getDepth(extent))
306 , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
307 , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
308 , m_dstDepth(static_cast<Idx>(getDepth(viewDst)))
309 , m_srcDepth(static_cast<Idx>(getDepth(viewSrc)))
310# endif
311 , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[1]))
312 , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[1]))
313 , m_dstSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
314 , m_srcSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
315 , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
316 , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
317 {
318# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
319 ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
320 ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
321 ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
322 ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
323 ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
324 ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
325 ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
326 ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
327# endif
328 }
329
330 template<typename TQueue>
331 auto enqueue(TQueue& queue) const -> void
332 {
333# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
334 printDebug();
335# endif
336 // This is not only an optimization but also prevents a division by zero.
337 if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0 || m_extentDepth == 0)
338 {
339 return;
340 }
341
342 // Create the struct describing the copy.
343 typename TApi::Memcpy3DParms_t const uniformCudaHipMemCpy3DParms(buildUniformCudaHipMemcpy3DParms());
344
345 // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
346 // see https://github.com/fwyzard/nvidia_bug_3446335 .
347 // Set the current device.
348 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
349
351 TApi::memcpy3DAsync(&uniformCudaHipMemCpy3DParms, queue.getNativeHandle()));
352 }
353
354 private:
355 ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms() const -> typename TApi::Memcpy3DParms_t
356 {
358
359 // Fill CUDA/HIP parameter structure.
360 typename TApi::Memcpy3DParms_t memCpy3DParms{}; // zero-init required per CUDA documentation
361 memCpy3DParms.srcPtr = TApi::makePitchedPtr(
362 const_cast<void*>(m_srcMemNative),
363 m_srcRowPitchBytes,
364 static_cast<std::size_t>(m_srcWidth),
365 m_srcSlicePitchBytes / m_srcRowPitchBytes);
366 memCpy3DParms.dstPtr = TApi::makePitchedPtr(
367 m_dstMemNative,
368 m_dstRowPitchBytes,
369 static_cast<std::size_t>(m_dstWidth),
370 m_dstSlicePitchBytes / m_dstRowPitchBytes);
371 memCpy3DParms.extent = TApi::makeExtent(
372 m_extentWidthBytes,
373 static_cast<std::size_t>(m_extentHeight),
374 static_cast<std::size_t>(m_extentDepth));
375 memCpy3DParms.kind = m_uniformMemCpyKind;
376 return memCpy3DParms;
377 }
378
379# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
380 ALPAKA_FN_HOST auto printDebug() const -> void
381 {
382 std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
383 << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice
384 << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth
385 << " dptr: " << m_dstMemNative << " drowpitch: " << m_dstRowPitchBytes
386 << " dslicepitch: " << m_dstSlicePitchBytes << " sdev: " << m_iSrcDevice
387 << " sw: " << m_srcWidth << " sh: " << m_srcHeight << " sd: " << m_srcDepth
388 << " sptr: " << m_srcMemNative << " srowpitch: " << m_srcRowPitchBytes
389 << " sslicepitch: " << m_srcSlicePitchBytes << std::endl;
390 }
391# endif
392 typename TApi::MemcpyKind_t m_uniformMemCpyKind;
393 int m_iDstDevice;
394 int m_iSrcDevice;
395
396 Idx m_extentWidth;
397 std::size_t m_extentWidthBytes;
398 Idx m_dstWidth;
399 Idx m_srcWidth;
400
401 Idx m_extentHeight;
402 Idx m_extentDepth;
403# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
404 Idx m_dstHeight;
405 Idx m_srcHeight;
406 Idx m_dstDepth;
407 Idx m_srcDepth;
408# endif
409 std::size_t m_dstRowPitchBytes;
410 std::size_t m_srcRowPitchBytes;
411 std::size_t m_dstSlicePitchBytes;
412 std::size_t m_srcSlicePitchBytes;
413
414 void* m_dstMemNative;
415 void const* m_srcMemNative;
416 };
417 } // namespace detail
418
419 // Trait specializations for CreateTaskMemcpy.
420 namespace trait
421 {
422 //! The CUDA/HIP to CPU memory copy trait specialization.
423 template<typename TApi, typename TDim>
424 struct CreateTaskMemcpy<TDim, DevCpu, DevUniformCudaHipRt<TApi>>
425 {
426 template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
428 TViewDstFwd&& viewDst,
429 TViewSrc const& viewSrc,
430 TExtent const& extent) -> alpaka::detail::
431 TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
432 {
434
435 auto const iDevice = getDev(viewSrc).getNativeHandle();
436
437 return {
438 std::forward<TViewDstFwd>(viewDst),
439 viewSrc,
440 extent,
441 TApi::memcpyDeviceToHost,
442 iDevice,
443 iDevice};
444 }
445 };
446
447 //! The CPU to CUDA/HIP memory copy trait specialization.
448 template<typename TApi, typename TDim>
449 struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevCpu>
450 {
451 template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
453 TViewDstFwd&& viewDst,
454 TViewSrc const& viewSrc,
455 TExtent const& extent) -> alpaka::detail::
456 TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
457 {
459
460 auto const iDevice = getDev(viewDst).getNativeHandle();
461
462 return {
463 std::forward<TViewDstFwd>(viewDst),
464 viewSrc,
465 extent,
466 TApi::memcpyHostToDevice,
467 iDevice,
468 iDevice};
469 }
470 };
471
472 //! The CUDA/HIP to CUDA/HIP memory copy trait specialization.
473 template<typename TApi, typename TDim>
474 struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevUniformCudaHipRt<TApi>>
475 {
476 template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
478 TViewDstFwd&& viewDst,
479 TViewSrc const& viewSrc,
480 TExtent const& extent) -> alpaka::detail::
481 TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
482 {
484
485 auto const iDstDevice = getDev(viewDst).getNativeHandle();
486
487 return {
488 std::forward<TViewDstFwd>(viewDst),
489 viewSrc,
490 extent,
491 TApi::memcpyDeviceToDevice,
492 iDstDevice,
493 getDev(viewSrc).getNativeHandle()};
494 }
495 };
496
497 //! The CUDA/HIP non-blocking device queue scalar copy enqueue trait specialization.
498 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
499 struct Enqueue<
501 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
502 {
503 ALPAKA_FN_HOST static auto enqueue(
504 QueueUniformCudaHipRtNonBlocking<TApi>& queue,
505 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
506 -> void
507 {
509
510 task.enqueue(queue);
511 }
512 };
513
514 //! The CUDA/HIP blocking device queue scalar copy enqueue trait specialization.
515 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
516 struct Enqueue<
518 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
519 {
520 ALPAKA_FN_HOST static auto enqueue(
521 QueueUniformCudaHipRtBlocking<TApi>& queue,
522 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
523 -> void
524 {
526
527 task.enqueue(queue);
528
529 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
530 }
531 };
532
533 //! The CUDA/HIP non-blocking device queue 1D copy enqueue trait specialization.
534 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
535 struct Enqueue<
537 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
538 {
539 ALPAKA_FN_HOST static auto enqueue(
540 QueueUniformCudaHipRtNonBlocking<TApi>& queue,
541 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
542 -> void
543 {
545
546 task.enqueue(queue);
547 }
548 };
549
550 //! The CUDA/HIP blocking device queue 1D copy enqueue trait specialization.
551 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
552 struct Enqueue<
554 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
555 {
556 ALPAKA_FN_HOST static auto enqueue(
557 QueueUniformCudaHipRtBlocking<TApi>& queue,
558 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
559 -> void
560 {
562
563 task.enqueue(queue);
564
565 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
566 }
567 };
568
569 //! The CUDA/HIP non-blocking device queue 2D copy enqueue trait specialization.
570 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
571 struct Enqueue<
573 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
574 {
575 ALPAKA_FN_HOST static auto enqueue(
576 QueueUniformCudaHipRtNonBlocking<TApi>& queue,
577 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
578 -> void
579 {
581
582 task.enqueue(queue);
583 }
584 };
585
586 //! The CUDA/HIP blocking device queue 2D copy enqueue trait specialization.
587 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
588 struct Enqueue<
590 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
591 {
592 ALPAKA_FN_HOST static auto enqueue(
593 QueueUniformCudaHipRtBlocking<TApi>& queue,
594 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
595 -> void
596 {
598
599 task.enqueue(queue);
600
601 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
602 }
603 };
604
605 //! The CUDA/HIP non-blocking device queue 3D copy enqueue trait specialization.
606 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
607 struct Enqueue<
609 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
610 {
611 ALPAKA_FN_HOST static auto enqueue(
612 QueueUniformCudaHipRtNonBlocking<TApi>& queue,
613 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
614 -> void
615 {
617
618 task.enqueue(queue);
619 }
620 };
621
622 //! The CUDA/HIP blocking device queue 3D copy enqueue trait specialization.
623 template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
624 struct Enqueue<
626 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
627 {
628 ALPAKA_FN_HOST static auto enqueue(
629 QueueUniformCudaHipRtBlocking<TApi>& queue,
630 alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
631 -> void
632 {
634
635 task.enqueue(queue);
636
637 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
638 }
639 };
640 } // namespace trait
641} // namespace alpaka
642
643#endif
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition Assert.hpp:13
#define ALPAKA_DEBUG
Set the minimum log level if it is not defined.
Definition Debug.hpp:22
#define ALPAKA_DEBUG_FULL_LOG_SCOPE
Definition Debug.hpp:62
#define ALPAKA_DEBUG_FULL
The full debug level.
Definition Debug.hpp:18
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The alpaka accelerator library.
typename trait::IdxType< T >::type Idx
Definition Traits.hpp:29
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getExtentProduct(T const &object) -> Idx< T >
Definition Traits.hpp:134
uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, true > QueueUniformCudaHipRtBlocking
The CUDA/HIP RT blocking queue.
ALPAKA_FN_HOST auto getPitchesInBytes(TView const &view) -> Vec< Dim< TView >, Idx< TView > >
Definition Traits.hpp:196
ALPAKA_FN_HOST auto createTaskMemcpy(TViewDstFwd &&viewDst, TViewSrc const &viewSrc, TExtent const &extent)
Creates a memory copy task.
Definition Traits.hpp:253
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getHeight(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:108
ALPAKA_FN_HOST auto getPtrNative(TView const &view) -> Elem< TView > const *
Gets the native pointer of the memory view.
Definition Traits.hpp:136
std::remove_volatile_t< typename trait::ElemType< TView >::type > Elem
The element type trait alias template to remove the ::type.
Definition Traits.hpp:21
ALPAKA_FN_HOST auto getDev(T const &t)
Definition Traits.hpp:68
ALPAKA_FN_HOST auto enqueue(TQueue &queue, TTask &&task) -> void
Queues the given task in the given queue.
Definition Traits.hpp:47
std::integral_constant< std::size_t, N > DimInt
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getDepth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getWidth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition Traits.hpp:95
uniform_cuda_hip::detail::QueueUniformCudaHipRt< TApi, false > QueueUniformCudaHipRtNonBlocking
The CUDA/HIP RT non-blocking queue.
STL namespace.
ALPAKA_FN_HOST TaskCopyUniformCudaHip(TViewDstFwd &&viewDst, TViewSrc const &viewSrc, TExtent const &extent, typename TApi::MemcpyKind_t const &uniformMemCpyKind, int const &iDstDevice, int const &iSrcDevice)
Definition Copy.hpp:42
ALPAKA_FN_HOST TaskCopyUniformCudaHip(TViewDstFwd &&viewDst, TViewSrc const &viewSrc, TExtent const &extent, typename TApi::MemcpyKind_t const &uniformMemCpyKind, int const &iDstDevice, int const &iSrcDevice)
Definition Copy.hpp:104
ALPAKA_FN_HOST TaskCopyUniformCudaHip(TViewDstFwd &&viewDst, TViewSrc const &viewSrc, TExtent const &extent, typename TApi::MemcpyKind_t const &uniformMemcpyKind, int const &iDstDevice, int const &iSrcDevice)
Definition Copy.hpp:184
ALPAKA_FN_HOST TaskCopyUniformCudaHip(TViewDstFwd &&viewDst, TViewSrc const &viewSrc, TExtent const &extent, typename TApi::MemcpyKind_t const &uniformMemcpyKind, int const &iDstDevice, int const &iSrcDevice)
Definition Copy.hpp:289
The CUDA/HIP memory copy trait.
Definition Copy.hpp:33