alpaka
Abstraction Library for Parallel Kernel Acceleration
AccGpuUniformCudaHipRt.hpp
/* Copyright 2024 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

// Base classes.
// ... (base-class includes elided in this listing)

// Specialized traits.
#include "alpaka/acc/Traits.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
// ...

// Implementation details.
// ...
#include "alpaka/core/Cuda.hpp"
// ...

#include <typeinfo>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt;
    //! The GPU CUDA/HIP accelerator.
    //!
    //! This accelerator allows parallel kernel execution on devices supporting CUDA or HIP.
    template<typename TApi, typename TDim, typename TIdx>
    class AccGpuUniformCudaHipRt final
        : public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>
        , public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>
        , public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>
        , public AtomicHierarchy<
              AtomicUniformCudaHipBuiltIn, // grid atomics
              AtomicUniformCudaHipBuiltIn, // block atomics
              AtomicUniformCudaHipBuiltIn> // thread atomics
        // ... (further bases elided, e.g. math, block shared memory, block sync)
# ifdef ALPAKA_DISABLE_VENDOR_RNG
        , public rand::RandDefault
# else
        , public rand::RandUniformCudaHipRand<TApi>
# endif
        // ...
        , public interface::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
    {
        static_assert(
            sizeof(TIdx) >= sizeof(int),
            "Index type is not supported, consider using int or a larger type.");

    public:
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
        auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
        auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;

        ALPAKA_FN_HOST_ACC AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
        {
        }
    };

    namespace trait
    {
        //! The GPU CUDA accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct AccType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
        };

        //! The GPU CUDA single thread accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IsSingleThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::false_type
        {
        };

        //! The GPU CUDA multi thread accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IsMultiThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::true_type
        {
        };

        //! The GPU CUDA accelerator device properties get trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt<TApi> const& dev)
                -> AccDevProps<TDim, TIdx>
            {
# ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                // Reading only the needed attributes with cudaDeviceGetAttribute is faster than reading
                // all of them with cudaGetDeviceProperties, see
                // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
                int multiProcessorCount = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &multiProcessorCount,
                    TApi::deviceAttributeMultiprocessorCount,
                    dev.getNativeHandle()));

                int maxGridSize[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[0],
                    TApi::deviceAttributeMaxGridDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[1],
                    TApi::deviceAttributeMaxGridDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[2],
                    TApi::deviceAttributeMaxGridDimZ,
                    dev.getNativeHandle()));

                int maxBlockDim[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[0],
                    TApi::deviceAttributeMaxBlockDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[1],
                    TApi::deviceAttributeMaxBlockDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[2],
                    TApi::deviceAttributeMaxBlockDimZ,
                    dev.getNativeHandle()));

                int maxThreadsPerBlock = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxThreadsPerBlock,
                    TApi::deviceAttributeMaxThreadsPerBlock,
                    dev.getNativeHandle()));

                int sharedMemSizeBytes = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &sharedMemSizeBytes,
                    TApi::deviceAttributeMaxSharedMemoryPerBlock,
                    dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(sharedMemSizeBytes),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};

# else
                typename TApi::DeviceProp_t properties;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&properties, dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(properties.multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(properties.maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(properties.sharedMemPerBlock),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};
# endif
            }
        };

        //! The GPU CUDA accelerator name trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccName() -> std::string
            {
                return std::string("AccGpu") + TApi::name + "Rt<" + std::to_string(TDim::value) + ","
                       + core::demangled<TIdx> + ">";
            }
        };

        //! The GPU CUDA accelerator device type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DevType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA accelerator dimension getter trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DimType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TDim;
        };
    } // namespace trait

    namespace detail
    {
        //! Specialization of the TKernelFnObj return type evaluation.
        //
        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side, see
        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
        // The execution task TaskKernelGpuUniformCudaHipRt therefore performs this check on the device side.
        template<typename TApi, typename TDim, typename TIdx>
        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            template<typename TKernelFnObj, typename... TArgs>
            void operator()(TKernelFnObj const&, TArgs const&...)
            {
            }
        };
    } // namespace detail

    namespace trait
    {
        //! The GPU CUDA accelerator execution task type trait specialization.
        template<
            typename TApi,
            typename TDim,
            typename TIdx,
            typename TWorkDiv,
            typename TKernelFnObj,
            typename... TArgs>
        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
        {
            ALPAKA_FN_HOST static auto createTaskKernel(
                TWorkDiv const& workDiv,
                TKernelFnObj const& kernelFnObj,
                TArgs&&... args)
            {
                return TaskKernelGpuUniformCudaHipRt<
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    TKernelFnObj,
                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
            }
        };

        //! The GPU CUDA accelerator platform type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct PlatformType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA accelerator idx type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IdxType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TIdx;
        };
    } // namespace trait
} // namespace alpaka

#endif
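For orientation, here is a minimal usage sketch of the traits this header specializes, written against alpaka's public API as used in the project's examples. It is a sketch under assumptions, not part of this file: the AccGpuCudaRt alias (the CUDA instantiation of AccGpuUniformCudaHipRt), the dimension and index choices, and device index 0 are illustrative.

    // Querying the properties exposed through GetAccDevProps and GetAccName above.
    #include <alpaka/alpaka.hpp>

    #include <cstddef>
    #include <iostream>

    auto main() -> int
    {
        using Dim = alpaka::DimInt<1u>;
        using Idx = std::size_t; // satisfies the static_assert: sizeof(Idx) >= sizeof(int)
        using Acc = alpaka::AccGpuCudaRt<Dim, Idx>; // CUDA-backed AccGpuUniformCudaHipRt

        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0); // first GPU (illustrative)
        auto const props = alpaka::getAccDevProps<Acc>(dev);

        std::cout << alpaka::getAccName<Acc>() << '\n' // e.g. "AccGpuCudaRt<1,unsigned long>"
                  << "multiprocessors:      " << props.m_multiProcessorCount << '\n'
                  << "max threads/block:    " << props.m_blockThreadCountMax << '\n'
                  << "shared mem per block: " << props.m_sharedMemSizeBytes << " bytes\n";
        return 0;
    }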
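The CreateTaskKernel specialization above is what alpaka::exec<Acc> routes through: it packages the work division, the kernel function object, and its arguments into a TaskKernelGpuUniformCudaHipRt and enqueues that task. A companion sketch under the same assumptions as before; the kernel name, queue kind, and work-division sizes are illustrative:

    #include <alpaka/alpaka.hpp>

    #include <cstddef>
    #include <cstdio>

    struct HelloKernel // illustrative kernel function object
    {
        template<typename TAcc>
        ALPAKA_FN_ACC auto operator()(TAcc const& acc) const -> void
        {
            // Grid-global thread index, provided by the gb::/bt:: index bases of the accelerator.
            auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
            printf("hello from grid thread %lu\n", static_cast<unsigned long>(gridThreadIdx));
        }
    };

    auto main() -> int
    {
        using Dim = alpaka::DimInt<1u>;
        using Idx = std::size_t;
        using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
        using Vec = alpaka::Vec<Dim, Idx>;

        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0);
        alpaka::Queue<Acc, alpaka::Blocking> queue{dev};

        // Blocks per grid, threads per block, elements per thread (illustrative sizes).
        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Vec{Idx{4}}, Vec{Idx{64}}, Vec{Idx{1}}};

        alpaka::exec<Acc>(queue, workDiv, HelloKernel{}); // builds TaskKernelGpuUniformCudaHipRt
        alpaka::wait(queue);
        return 0;
    }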