alpaka
Abstraction Library for Parallel Kernel Acceleration
AccGpuUniformCudaHipRt.hpp
/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

// Base classes.
#include "alpaka/atomic/AtomicHierarchy.hpp"
#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
#include "alpaka/intrinsic/IntrinsicUniformCudaHip.hpp"
#include "alpaka/math/MathUniformCudaHipBuiltIn.hpp"
#include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
#include "alpaka/rand/RandDefault.hpp"
#include "alpaka/rand/RandUniformCudaHipRand.hpp"
#include "alpaka/warp/WarpUniformCudaHipBuiltIn.hpp"
#include "alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp"

// Specialized traits.
#include "alpaka/acc/Traits.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/platform/Traits.hpp"

// Implementation details.
#include "alpaka/core/ClipCast.hpp"
#include "alpaka/core/Concepts.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Hip.hpp"

#include <limits>
#include <typeinfo>
#include <utility>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt;

    //! The GPU CUDA/HIP accelerator.
    //!
    //! This accelerator allows parallel kernel execution on devices supporting the CUDA or HIP runtime API.
    template<typename TApi, typename TDim, typename TIdx>
    class AccGpuUniformCudaHipRt final
        : public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>
        , public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>
        , public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>
        , public AtomicHierarchy<
              AtomicUniformCudaHipBuiltIn, // grid atomics
              AtomicUniformCudaHipBuiltIn, // block atomics
              AtomicUniformCudaHipBuiltIn> // thread atomics
        , public math::MathUniformCudaHipBuiltIn
        , public BlockSharedMemDynUniformCudaHipBuiltIn
        , public BlockSharedMemStUniformCudaHipBuiltIn
        , public BlockSyncUniformCudaHipBuiltIn
        , public IntrinsicUniformCudaHip
        , public MemFenceUniformCudaHipBuiltIn
#    ifdef ALPAKA_DISABLE_VENDOR_RNG
        , public rand::RandDefault
#    else
        , public rand::RandUniformCudaHipRand<TApi>
#    endif
        , public warp::WarpUniformCudaHipBuiltIn
        , public concepts::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
    {
        static_assert(
            sizeof(TIdx) >= sizeof(int),
            "Index type is not supported, consider using int or a larger type.");

    public:
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
        auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
        auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;

        ALPAKA_FN_HOST_ACC AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
        {
        }
    };

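    // Usage sketch (illustrative, not part of the original header): user code does not
    // construct this accelerator directly; it names the type as the accelerator template
    // parameter of alpaka's host-side API. Assuming a CUDA-enabled build where the API
    // tag alpaka::ApiCudaRt is available, a 1-dimensional accelerator over std::size_t
    // indices would be spelled:
    //
    //     using Acc = alpaka::AccGpuUniformCudaHipRt<alpaka::ApiCudaRt, alpaka::DimInt<1u>, std::size_t>;
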
    namespace trait
    {
        //! The GPU CUDA/HIP accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct AccType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
        };

        //! The GPU CUDA/HIP accelerator device properties get trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt<TApi> const& dev)
                -> AccDevProps<TDim, TIdx>
            {
#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading
                // all of them with cudaGetDeviceProperties.
                // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
                int multiProcessorCount = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &multiProcessorCount,
                    TApi::deviceAttributeMultiprocessorCount,
                    dev.getNativeHandle()));

                int maxGridSize[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[0],
                    TApi::deviceAttributeMaxGridDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[1],
                    TApi::deviceAttributeMaxGridDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[2],
                    TApi::deviceAttributeMaxGridDimZ,
                    dev.getNativeHandle()));

                int maxBlockDim[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[0],
                    TApi::deviceAttributeMaxBlockDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[1],
                    TApi::deviceAttributeMaxBlockDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[2],
                    TApi::deviceAttributeMaxBlockDimZ,
                    dev.getNativeHandle()));

                int maxThreadsPerBlock = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxThreadsPerBlock,
                    TApi::deviceAttributeMaxThreadsPerBlock,
                    dev.getNativeHandle()));

                int sharedMemSizeBytes = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &sharedMemSizeBytes,
                    TApi::deviceAttributeMaxSharedMemoryPerBlock,
                    dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(sharedMemSizeBytes),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};

#    else
                typename TApi::DeviceProp_t properties;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&properties, dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(properties.multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(properties.maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(properties.sharedMemPerBlock),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};
#    endif
            }
        };

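        // Usage sketch (illustrative): this trait is normally reached through the free
        // function alpaka::getAccDevProps. Assuming Acc is an accelerator alias as above
        // and dev is a DevUniformCudaHipRt<TApi> handle:
        //
        //     auto const props = alpaka::getAccDevProps<Acc>(dev);
        //     auto const maxThreadsPerBlock = props.m_blockThreadCountMax;
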
        //! The GPU CUDA/HIP accelerator name trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccName() -> std::string
            {
                return std::string("AccGpu") + TApi::name + "Rt<" + std::to_string(TDim::value) + ","
                       + core::demangled<TIdx> + ">";
            }
        };

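        // For example, assuming TApi::name is "Cuda", TDim is DimInt<1u> and TIdx is
        // unsigned int, getAccName() returns "AccGpuCudaRt<1,unsigned int>".
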
        //! The GPU CUDA/HIP accelerator device type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DevType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP accelerator dimension getter trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DimType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TDim;
        };
    } // namespace trait

    namespace detail
    {
        //! Specialization of the TKernelFnObj return type evaluation.
        //
        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
        // The execution task TaskKernelGpuUniformCudaHipRt therefore performs this check on the device side.
        template<typename TApi, typename TDim, typename TIdx>
        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            template<typename TKernelFnObj, typename... TArgs>
            void operator()(TKernelFnObj const&, TArgs const&...)
            {
            }
        };
    } // namespace detail

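    // Illustrative note (an interpretation of the comment above): for a device-side
    // kernel lambda such as
    //
    //     auto kernelFnObj = [] ALPAKA_FN_ACC(auto const& acc) { /* ... */ };
    //
    // the host-side compiler pass cannot evaluate the lambda's return type, so the
    // void-return requirement is enforced inside TaskKernelGpuUniformCudaHipRt on the
    // device side, and this host-side functor is deliberately a no-op.
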
    namespace trait
    {
        //! The GPU CUDA/HIP accelerator execution task type trait specialization.
        template<
            typename TApi,
            typename TDim,
            typename TIdx,
            typename TWorkDiv,
            typename TKernelFnObj,
            typename... TArgs>
        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
        {
            ALPAKA_FN_HOST static auto createTaskKernel(
                TWorkDiv const& workDiv,
                TKernelFnObj const& kernelFnObj,
                TArgs&&... args)
            {
                return TaskKernelGpuUniformCudaHipRt<
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    TKernelFnObj,
                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
            }
        };

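        // Usage sketch (illustrative; queue, workDiv, kernelFnObj and args are assumed
        // to exist): the specialization is reached through alpaka::createTaskKernel,
        // and the returned task is enqueued explicitly,
        //
        //     auto task = alpaka::createTaskKernel<Acc>(workDiv, kernelFnObj, args...);
        //     alpaka::enqueue(queue, task);
        //
        // or both steps are combined via alpaka::exec<Acc>(queue, workDiv, kernelFnObj, args...).
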
        //! The GPU CUDA/HIP accelerator platform type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct PlatformType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA/HIP accelerator idx type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IdxType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TIdx;
        };
    } // namespace trait
} // namespace alpaka

#endif