alpaka
Abstraction Library for Parallel Kernel Acceleration
Set.hpp
Go to the documentation of this file.
1 /* Copyright 2023 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
2  * Antonio Di Pilato, Jan Stephan
3  * SPDX-License-Identifier: MPL-2.0
4  */
5 
6 #pragma once
7 
8 #include "alpaka/core/Assert.hpp"
9 #include "alpaka/core/Cuda.hpp"
10 #include "alpaka/core/Hip.hpp"
11 #include "alpaka/dev/Traits.hpp"
13 #include "alpaka/extent/Traits.hpp"
17 #include "alpaka/queue/Traits.hpp"
18 #include "alpaka/wait/Traits.hpp"
19 
20 #include <cstddef>
21 
22 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
23 
24 namespace alpaka
25 {
26  template<typename TApi>
27  class DevUniformCudaHipRt;
28 
29  namespace detail
30  {
31  //! The CUDA/HIP memory set task base.
32  template<typename TApi, typename TDim, typename TView, typename TExtent>
34  {
35  TaskSetUniformCudaHipBase(TView& view, std::uint8_t const& byte, TExtent const& extent)
36  : m_view(view)
37  , m_byte(byte)
38  , m_extent(extent)
40  {
41  }
42 
43  protected:
44  TView& m_view;
45  std::uint8_t const m_byte;
46  TExtent const m_extent;
47  std::int32_t const m_iDevice;
48  };
49 
50  //! The CUDA/HIP memory set task.
51  template<typename TApi, typename TDim, typename TView, typename TExtent>
53 
54  //! The scalar CUDA/HIP memory set task.
55  template<typename TApi, typename TView, typename TExtent>
56  struct TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>
57  : public TaskSetUniformCudaHipBase<TApi, DimInt<0u>, TView, TExtent>
58  {
59  template<typename TViewFwd>
60  TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
61  : TaskSetUniformCudaHipBase<TApi, DimInt<0u>, TView, TExtent>(
62  std::forward<TViewFwd>(view),
63  byte,
64  extent)
65  {
66  }
67 
68  template<typename TQueue>
69  auto enqueue(TQueue& queue) const -> void
70  {
71  // Initiate the memory set.
72  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memsetAsync(
73  getPtrNative(this->m_view),
74  static_cast<int>(this->m_byte),
75  sizeof(Elem<TView>),
76  queue.getNativeHandle()));
77  }
78  };
79 
80  //! The 1D CUDA/HIP memory set task.
81  template<typename TApi, typename TView, typename TExtent>
82  struct TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>
83  : public TaskSetUniformCudaHipBase<TApi, DimInt<1u>, TView, TExtent>
84  {
85  template<typename TViewFwd>
86  TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
87  : TaskSetUniformCudaHipBase<TApi, DimInt<1u>, TView, TExtent>(
88  std::forward<TViewFwd>(view),
89  byte,
90  extent)
91  {
92  }
93 
94  template<typename TQueue>
95  auto enqueue(TQueue& queue) const -> void
96  {
97  auto& view = this->m_view;
98  auto const& extent = this->m_extent;
99 
100  auto const extentWidth = getWidth(extent);
101  ALPAKA_ASSERT(extentWidth <= getWidth(view));
102 
103  if(extentWidth == 0)
104  {
105  return;
106  }
107 
108  // Initiate the memory set.
109  auto const extentWidthBytes = static_cast<std::size_t>(extentWidth) * sizeof(Elem<TView>);
110  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memsetAsync(
111  getPtrNative(view),
112  static_cast<int>(this->m_byte),
113  extentWidthBytes,
114  queue.getNativeHandle()));
115  }
116  };
117 
118  //! The 2D CUDA/HIP memory set task.
119  template<typename TApi, typename TView, typename TExtent>
120  struct TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>
121  : public TaskSetUniformCudaHipBase<TApi, DimInt<2u>, TView, TExtent>
122  {
123  template<typename TViewFwd>
124  TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
125  : TaskSetUniformCudaHipBase<TApi, DimInt<2u>, TView, TExtent>(
126  std::forward<TViewFwd>(view),
127  byte,
128  extent)
129  {
130  }
131 
132  template<typename TQueue>
133  auto enqueue(TQueue& queue) const -> void
134  {
135  auto& view = this->m_view;
136  auto const& extent = this->m_extent;
137 
138  auto const extentWidth = getWidth(extent);
139  auto const extentHeight = getHeight(extent);
140 
141  if(extentWidth == 0 || extentHeight == 0)
142  {
143  return;
144  }
145 
146  auto const extentWidthBytes = static_cast<std::size_t>(extentWidth) * sizeof(Elem<TView>);
147 
148 # if !defined(NDEBUG)
149  auto const dstWidth = getWidth(view);
150  auto const dstHeight = getHeight(view);
151 # endif
152  auto const dstRowPitchBytes = static_cast<std::size_t>(getPitchesInBytes(view)[0]);
153  auto const dstNativePtr = reinterpret_cast<void*>(getPtrNative(view));
154  ALPAKA_ASSERT(extentWidth <= dstWidth);
155  ALPAKA_ASSERT(extentHeight <= dstHeight);
156 
157  // Initiate the memory set.
158  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset2DAsync(
159  dstNativePtr,
160  dstRowPitchBytes,
161  static_cast<int>(this->m_byte),
162  extentWidthBytes,
163  static_cast<std::size_t>(extentHeight),
164  queue.getNativeHandle()));
165  }
166  };
167 
168  //! The 3D CUDA/HIP memory set task.
169  template<typename TApi, typename TView, typename TExtent>
170  struct TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>
171  : public TaskSetUniformCudaHipBase<TApi, DimInt<3u>, TView, TExtent>
172  {
173  template<typename TViewFwd>
174  TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
175  : TaskSetUniformCudaHipBase<TApi, DimInt<3u>, TView, TExtent>(
176  std::forward<TViewFwd>(view),
177  byte,
178  extent)
179  {
180  }
181 
182  template<typename TQueue>
183  auto enqueue(TQueue& queue) const -> void
184  {
185  using Elem = alpaka::Elem<TView>;
186 
187  auto& view = this->m_view;
188  auto const& extent = this->m_extent;
189 
190  auto const extentWidth = getWidth(extent);
191  auto const extentHeight = getHeight(extent);
192  auto const extentDepth = getDepth(extent);
193 
194  // This is not only an optimization but also prevents a division by zero.
195  if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
196  {
197  return;
198  }
199 
200  auto const dstWidth = getWidth(view);
201 # if !defined(NDEBUG)
202  auto const dstHeight = getHeight(view);
203  auto const dstDepth = getDepth(view);
204 # endif
205  auto const [dstSlicePitchBytes, dstRowPitchBytes, _] = getPitchesInBytes(view);
206  auto const dstNativePtr = reinterpret_cast<void*>(getPtrNative(view));
207  ALPAKA_ASSERT(extentWidth <= dstWidth);
208  ALPAKA_ASSERT(extentHeight <= dstHeight);
209  ALPAKA_ASSERT(extentDepth <= dstDepth);
210 
211  // Fill CUDA parameter structures.
212  typename TApi::PitchedPtr_t const pitchedPtrVal = TApi::makePitchedPtr(
213  dstNativePtr,
214  static_cast<std::size_t>(dstRowPitchBytes),
215  static_cast<std::size_t>(dstWidth) * sizeof(Elem),
216  static_cast<std::size_t>(dstSlicePitchBytes / dstRowPitchBytes));
217 
218  typename TApi::Extent_t const extentVal = TApi::makeExtent(
219  static_cast<std::size_t>(extentWidth) * sizeof(Elem),
220  static_cast<std::size_t>(extentHeight),
221  static_cast<std::size_t>(extentDepth));
222 
223  // Initiate the memory set.
224  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset3DAsync(
225  pitchedPtrVal,
226  static_cast<int>(this->m_byte),
227  extentVal,
228  queue.getNativeHandle()));
229  }
230  };
231  } // namespace detail
232 
233  namespace trait
234  {
235  //! The CUDA device memory set trait specialization.
236  template<typename TApi, typename TDim>
238  {
239  template<typename TExtent, typename TView>
240  ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
242  {
244  }
245  };
246 
247  //! The CUDA non-blocking device queue scalar set enqueue trait specialization.
248  template<typename TApi, typename TView, typename TExtent>
249  struct Enqueue<
251  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>>
252  {
253  ALPAKA_FN_HOST static auto enqueue(
255  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent> const& task) -> void
256  {
258 
259  task.enqueue(queue);
260  }
261  };
262 
263  //! The CUDA blocking device queue scalar set enqueue trait specialization.
264  template<typename TApi, typename TView, typename TExtent>
265  struct Enqueue<
267  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>>
268  {
269  ALPAKA_FN_HOST static auto enqueue(
271  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent> const& task) -> void
272  {
274 
275  task.enqueue(queue);
276 
277  wait(queue);
278  }
279  };
280 
281  //! The CUDA non-blocking device queue 1D set enqueue trait specialization.
282  template<typename TApi, typename TView, typename TExtent>
283  struct Enqueue<
285  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>>
286  {
287  ALPAKA_FN_HOST static auto enqueue(
289  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent> const& task) -> void
290  {
292 
293  task.enqueue(queue);
294  }
295  };
296 
297  //! The CUDA blocking device queue 1D set enqueue trait specialization.
298  template<typename TApi, typename TView, typename TExtent>
299  struct Enqueue<
301  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>>
302  {
303  ALPAKA_FN_HOST static auto enqueue(
305  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent> const& task) -> void
306  {
308 
309  task.enqueue(queue);
310 
311  wait(queue);
312  }
313  };
314 
315  //! The CUDA non-blocking device queue 2D set enqueue trait specialization.
316  template<typename TApi, typename TView, typename TExtent>
317  struct Enqueue<
319  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>>
320  {
321  ALPAKA_FN_HOST static auto enqueue(
323  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent> const& task) -> void
324  {
326 
327  task.enqueue(queue);
328  }
329  };
330 
331  //! The CUDA blocking device queue 2D set enqueue trait specialization.
332  template<typename TApi, typename TView, typename TExtent>
333  struct Enqueue<
335  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>>
336  {
337  ALPAKA_FN_HOST static auto enqueue(
339  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent> const& task) -> void
340  {
342 
343  task.enqueue(queue);
344 
345  wait(queue);
346  }
347  };
348 
349  //! The CUDA non-blocking device queue 3D set enqueue trait specialization.
350  template<typename TApi, typename TView, typename TExtent>
351  struct Enqueue<
353  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>>
354  {
355  ALPAKA_FN_HOST static auto enqueue(
357  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent> const& task) -> void
358  {
360 
361  task.enqueue(queue);
362  }
363  };
364 
365  //! The CUDA blocking device queue 3D set enqueue trait specialization.
366  template<typename TApi, typename TView, typename TExtent>
367  struct Enqueue<
369  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>>
370  {
371  ALPAKA_FN_HOST static auto enqueue(
373  alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent> const& task) -> void
374  {
376 
377  task.enqueue(queue);
378 
379  wait(queue);
380  }
381  };
382  } // namespace trait
383 } // namespace alpaka
384 
385 #endif
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition: Assert.hpp:13
#define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
Definition: Debug.hpp:55
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
The alpaka accelerator library.
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getHeight(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition: Traits.hpp:108
ALPAKA_FN_HOST auto getPitchesInBytes(TView const &view) -> Vec< Dim< TView >, Idx< TView >>
Definition: Traits.hpp:196
ALPAKA_FN_HOST auto getPtrNative(TView const &view) -> Elem< TView > const *
Gets the native pointer of the memory view.
Definition: Traits.hpp:136
std::remove_volatile_t< typename trait::ElemType< TView >::type > Elem
The element type trait alias template to remove the ::type.
Definition: Traits.hpp:21
ALPAKA_FN_HOST auto getDev(T const &t)
Definition: Traits.hpp:68
ALPAKA_FN_HOST auto getNativeHandle(TImpl const &impl)
Get the native handle of the alpaka object. It will return the alpaka object handle if there is any,...
Definition: Traits.hpp:29
std::integral_constant< std::size_t, N > DimInt
ALPAKA_FN_HOST auto wait(TAwaited const &awaited) -> void
Blocks the calling thread until the given awaited action has completed.
Definition: Traits.hpp:34
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getDepth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition: Traits.hpp:121
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC auto getWidth(TExtent const &extent=TExtent()) -> Idx< TExtent >
Definition: Traits.hpp:95
The CUDA/HIP memory set task base.
Definition: Set.hpp:34
TaskSetUniformCudaHipBase(TView &view, std::uint8_t const &byte, TExtent const &extent)
Definition: Set.hpp:35
TaskSetUniformCudaHip(TViewFwd &&view, std::uint8_t const &byte, TExtent const &extent)
Definition: Set.hpp:60
TaskSetUniformCudaHip(TViewFwd &&view, std::uint8_t const &byte, TExtent const &extent)
Definition: Set.hpp:86
TaskSetUniformCudaHip(TViewFwd &&view, std::uint8_t const &byte, TExtent const &extent)
Definition: Set.hpp:124
TaskSetUniformCudaHip(TViewFwd &&view, std::uint8_t const &byte, TExtent const &extent)
Definition: Set.hpp:174
The CUDA/HIP memory set task.
Definition: Set.hpp:52
static ALPAKA_FN_HOST auto createTaskMemset(TView &view, std::uint8_t const &byte, TExtent const &extent) -> alpaka::detail::TaskSetUniformCudaHip< TApi, TDim, TView, TExtent >
Definition: Set.hpp:240
The memory set task trait.
Definition: Traits.hpp:114
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 0u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:269
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 3u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:371
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 1u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:303
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 2u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:337
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtNonBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 1u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:287
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtNonBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 2u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:321
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtNonBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 0u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:253
static ALPAKA_FN_HOST auto enqueue(QueueUniformCudaHipRtNonBlocking< TApi > &queue, alpaka::detail::TaskSetUniformCudaHip< TApi, DimInt< 3u >, TView, TExtent > const &task) -> void
Definition: Set.hpp:355
The queue enqueue trait.
Definition: Traits.hpp:27