alpaka
Abstraction Library for Parallel Kernel Acceleration
AccGpuUniformCudaHipRt.hpp
/* Copyright 2024 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

// Base classes.
#include "alpaka/atomic/AtomicHierarchy.hpp"
#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
#include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp"
#include "alpaka/math/MathUniformCudaHipBuiltIn.hpp"
#include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
#include "alpaka/rand/RandDefault.hpp"
#include "alpaka/rand/RandUniformCudaHipRand.hpp"
#include "alpaka/warp/WarpUniformCudaHipBuiltIn.hpp"
#include "alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp"

// Specialized traits.
#include "alpaka/acc/Traits.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/platform/Traits.hpp"

// Implementation details.
#include "alpaka/core/ClipCast.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/dev/DevUniformCudaHipRt.hpp"

#include <limits>
#include <typeinfo>
#include <utility>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt;

    //! The GPU CUDA/HIP accelerator.
    //!
    //! This accelerator allows parallel kernel execution on devices supporting CUDA or HIP.
    template<typename TApi, typename TDim, typename TIdx>
    class AccGpuUniformCudaHipRt final
        : public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>
        , public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>
        , public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>
        , public AtomicHierarchy<
              AtomicUniformCudaHipBuiltIn, // grid atomics
              AtomicUniformCudaHipBuiltIn, // block atomics
              AtomicUniformCudaHipBuiltIn> // thread atomics
        , public math::MathUniformCudaHipBuiltIn
        , public BlockSharedMemDynUniformCudaHipBuiltIn
        , public BlockSharedMemStUniformCudaHipBuiltIn
        , public BlockSyncUniformCudaHipBuiltIn
        , public IntrinsicUniformCudaHipBuiltIn
        , public MemFenceUniformCudaHipBuiltIn
#    ifdef ALPAKA_DISABLE_VENDOR_RNG
        , public rand::RandDefault
#    else
        , public rand::RandUniformCudaHipRand<TApi>
#    endif
        , public warp::WarpUniformCudaHipBuiltIn
        , public interface::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
    {
        static_assert(
            sizeof(TIdx) >= sizeof(int),
            "Index type is not supported, consider using int or a larger type.");

    public:
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
        auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
        auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;

        ALPAKA_FN_HOST_ACC AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
        {
        }
    };
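
    // -----------------------------------------------------------------------
    // Editor's usage sketch (not part of the original header): how this
    // accelerator is typically selected and driven through the public alpaka
    // API. `AccGpuCudaRt` is the CUDA-backed alias of AccGpuUniformCudaHipRt;
    // `MyKernel` and the work-division numbers below are hypothetical.
    //
    //     using Dim = alpaka::DimInt<1u>;
    //     using Idx = std::size_t;
    //     using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
    //
    //     struct MyKernel
    //     {
    //         template<typename TAcc>
    //         ALPAKA_FN_ACC void operator()(TAcc const& acc) const { /* ... */ }
    //     };
    //
    //     auto const platform = alpaka::Platform<Acc>{};
    //     auto const dev = alpaka::getDevByIdx(platform, 0);
    //     auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
    //     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
    //         Idx{8},   // blocks per grid
    //         Idx{256}, // threads per block
    //         Idx{1}};  // elements per thread
    //     alpaka::exec<Acc>(queue, workDiv, MyKernel{});
    // -----------------------------------------------------------------------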

    namespace trait
    {
        //! The GPU CUDA accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct AccType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
        };

        //! The GPU CUDA single thread accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IsSingleThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::false_type
        {
        };

        //! The GPU CUDA multi thread accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IsMultiThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::true_type
        {
        };

        //! The GPU CUDA accelerator device properties get trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt<TApi> const& dev)
                -> AccDevProps<TDim, TIdx>
            {
#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading
                // all of them with cudaGetDeviceProperties:
                // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
                int multiProcessorCount = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &multiProcessorCount,
                    TApi::deviceAttributeMultiprocessorCount,
                    dev.getNativeHandle()));

                int maxGridSize[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[0],
                    TApi::deviceAttributeMaxGridDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[1],
                    TApi::deviceAttributeMaxGridDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[2],
                    TApi::deviceAttributeMaxGridDimZ,
                    dev.getNativeHandle()));

                int maxBlockDim[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[0],
                    TApi::deviceAttributeMaxBlockDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[1],
                    TApi::deviceAttributeMaxBlockDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[2],
                    TApi::deviceAttributeMaxBlockDimZ,
                    dev.getNativeHandle()));

                int maxThreadsPerBlock = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxThreadsPerBlock,
                    TApi::deviceAttributeMaxThreadsPerBlock,
                    dev.getNativeHandle()));

                int sharedMemSizeBytes = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &sharedMemSizeBytes,
                    TApi::deviceAttributeMaxSharedMemoryPerBlock,
                    dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(sharedMemSizeBytes),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};

#    else
                typename TApi::DeviceProp_t properties;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&properties, dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(properties.multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(properties.maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(properties.sharedMemPerBlock),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};
#    endif
            }
        };
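
        // -------------------------------------------------------------------
        // Editor's usage sketch (not part of the original header): this trait
        // is normally reached through alpaka::getAccDevProps rather than being
        // instantiated directly. Assuming the hypothetical Acc alias from the
        // sketch above:
        //
        //     auto const platform = alpaka::Platform<Acc>{};
        //     auto const dev = alpaka::getDevByIdx(platform, 0);
        //     auto const props = alpaka::getAccDevProps<Acc>(dev);
        //     std::cout << "multiprocessors: " << props.m_multiProcessorCount
        //               << ", max threads per block: " << props.m_blockThreadCountMax << '\n';
        // -------------------------------------------------------------------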

        //! The GPU CUDA accelerator name trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccName() -> std::string
            {
                return std::string("AccGpu") + TApi::name + "Rt<" + std::to_string(TDim::value) + ","
                       + core::demangled<TIdx> + ">";
            }
        };
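
        // Editor's note (illustrative, not part of the original header): via
        // alpaka::getAccName<Acc>() the string built above comes out roughly as
        //
        //     "AccGpuCudaRt<1,unsigned long>"
        //
        // for the CUDA back-end with TDim = DimInt<1u> and TIdx = std::size_t;
        // the exact index-type spelling depends on the platform's demangler.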

        //! The GPU CUDA accelerator device type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DevType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA accelerator dimension getter trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DimType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TDim;
        };
    } // namespace trait

    namespace detail
    {
        //! Specialization of the TKernelFnObj return type evaluation.
        //
        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
        // The execution task TaskKernelGpuUniformCudaHipRt therefore performs this check on the device side.
        template<typename TApi, typename TDim, typename TIdx>
        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            template<typename TKernelFnObj, typename... TArgs>
            void operator()(TKernelFnObj const&, TArgs const&...)
            {
            }
        };
    } // namespace detail
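
    // Editor's note (illustrative, not part of the original header): the
    // deferred check only requires that a kernel's operator() returns void.
    // The hypothetical MyKernel from the sketch above passes, while e.g.
    //
    //     ALPAKA_FN_ACC int operator()(TAcc const&) const; // non-void: rejected
    //
    // would fail the device-side check performed by TaskKernelGpuUniformCudaHipRt.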

    namespace trait
    {
        //! The GPU CUDA accelerator execution task type trait specialization.
        template<
            typename TApi,
            typename TDim,
            typename TIdx,
            typename TWorkDiv,
            typename TKernelFnObj,
            typename... TArgs>
        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
        {
            ALPAKA_FN_HOST static auto createTaskKernel(
                TWorkDiv const& workDiv,
                TKernelFnObj const& kernelFnObj,
                TArgs&&... args)
            {
                return TaskKernelGpuUniformCudaHipRt<
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    TKernelFnObj,
                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
            }
        };
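
        // -------------------------------------------------------------------
        // Editor's usage sketch (not part of the original header): the
        // two-step launch path that this trait backs, reusing the hypothetical
        // Acc/queue/workDiv/MyKernel names from the first sketch:
        //
        //     auto task = alpaka::createTaskKernel<Acc>(workDiv, MyKernel{});
        //     alpaka::enqueue(queue, task);
        //
        // alpaka::exec<Acc>(...) is a shorthand that creates and enqueues the
        // task in one call.
        // -------------------------------------------------------------------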

        //! The GPU CUDA accelerator platform type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct PlatformType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA accelerator idx type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IdxType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TIdx;
        };
    } // namespace trait
} // namespace alpaka

#endif