alpaka
Abstraction Library for Parallel Kernel Acceleration
AccGpuUniformCudaHipRt.hpp
/* Copyright 2025 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
 * SPDX-License-Identifier: MPL-2.0
 */

#pragma once

// Base classes.
// ...

// Specialized traits.
#include "alpaka/acc/Traits.hpp"
#include "alpaka/dev/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/kernel/Traits.hpp"
#include "alpaka/platform/Traits.hpp"

// Implementation details.
#include "alpaka/core/ClipCast.hpp"
#include "alpaka/core/Cuda.hpp"
#include "alpaka/core/Hip.hpp"
#include "alpaka/dev/DevUniformCudaHipRt.hpp"

#ifdef __cpp_lib_format
# include <format>
#endif
#include <string>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace alpaka
{
    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
    class TaskKernelGpuUniformCudaHipRt;

    //! The GPU CUDA accelerator.
    //!
    //! This accelerator allows parallel kernel execution on devices supporting CUDA.
    template<typename TApi, typename TDim, typename TIdx>
    class AccGpuUniformCudaHipRt final
        : public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>
        , public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>
        , public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>
        , public AtomicHierarchy<
              AtomicUniformCudaHipBuiltIn, // grid atomics
              AtomicUniformCudaHipBuiltIn, // block atomics
              AtomicUniformCudaHipBuiltIn> // thread atomics
        , public math::MathUniformCudaHipBuiltIn
        , public BlockSharedMemDynUniformCudaHipBuiltIn
        , public BlockSharedMemStUniformCudaHipBuiltIn
        , public BlockSyncUniformCudaHipBuiltIn
        , public IntrinsicUniformCudaHip
        , public MemFenceUniformCudaHipBuiltIn
# ifdef ALPAKA_DISABLE_VENDOR_RNG
        , public rand::RandDefault
# else
        , public rand::RandUniformCudaHipRand<TApi>
# endif
        , public warp::WarpUniformCudaHipBuiltIn
        , public interface::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
    {
        static_assert(
            sizeof(TIdx) >= sizeof(int),
            "Index type is not supported, consider using int or a larger type.");

    public:
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
        auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
        auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;

        ALPAKA_FN_HOST_ACC AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
        {
        }
    };
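
    // Example (illustrative, not part of this header): in a CUDA-enabled build, user code normally
    // refers to this accelerator through the convenience alias AccGpuCudaRt, e.g.
    //     using Acc = alpaka::AccGpuCudaRt<alpaka::DimInt<1u>, std::size_t>;
    // which expands to AccGpuUniformCudaHipRt<ApiCudaRt, DimInt<1u>, std::size_t>.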

    namespace trait
    {
        //! The GPU CUDA accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct AccType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
        };

        //! The GPU CUDA single thread accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IsSingleThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::false_type
        {
        };

        //! The GPU CUDA multi thread accelerator type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IsMultiThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::true_type
        {
        };

        //! The GPU CUDA accelerator device properties get trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt<TApi> const& dev) -> AccDevProps<TDim, TIdx>
            {
# ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading
                // all of them with cudaGetDeviceProperties:
                // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
                int multiProcessorCount = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &multiProcessorCount,
                    TApi::deviceAttributeMultiprocessorCount,
                    dev.getNativeHandle()));

                int maxGridSize[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[0],
                    TApi::deviceAttributeMaxGridDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[1],
                    TApi::deviceAttributeMaxGridDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxGridSize[2],
                    TApi::deviceAttributeMaxGridDimZ,
                    dev.getNativeHandle()));

                int maxBlockDim[3] = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[0],
                    TApi::deviceAttributeMaxBlockDimX,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[1],
                    TApi::deviceAttributeMaxBlockDimY,
                    dev.getNativeHandle()));
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxBlockDim[2],
                    TApi::deviceAttributeMaxBlockDimZ,
                    dev.getNativeHandle()));

                int maxThreadsPerBlock = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &maxThreadsPerBlock,
                    TApi::deviceAttributeMaxThreadsPerBlock,
                    dev.getNativeHandle()));

                int sharedMemSizeBytes = {};
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
                    &sharedMemSizeBytes,
                    TApi::deviceAttributeMaxSharedMemoryPerBlock,
                    dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(sharedMemSizeBytes),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};

# else
                typename TApi::DeviceProp_t properties;
                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&properties, dev.getNativeHandle()));

                return {// m_multiProcessorCount
                        alpaka::core::clipCast<TIdx>(properties.multiProcessorCount),
                        // m_gridBlockExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[0u]))),
                        // m_gridBlockCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_blockThreadExtentMax
                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[2u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[1u]),
                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[0u]))),
                        // m_blockThreadCountMax
                        alpaka::core::clipCast<TIdx>(properties.maxThreadsPerBlock),
                        // m_threadElemExtentMax
                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                        // m_threadElemCountMax
                        std::numeric_limits<TIdx>::max(),
                        // m_sharedMemSizeBytes
                        static_cast<size_t>(properties.sharedMemPerBlock),
                        // m_globalMemSizeBytes
                        getMemBytes(dev)};
# endif
            }
        };

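        // Example (illustrative, not part of this header): user code reaches this specialization
        // through the generic free function, e.g.
        //     auto const props = alpaka::getAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>(dev);
        //     auto const maxThreadsPerBlock = props.m_blockThreadCountMax;
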
        //! The GPU CUDA accelerator name trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct GetAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            ALPAKA_FN_HOST static auto getAccName() -> std::string
            {
# if ALPAKA_COMP_CLANG
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wexit-time-destructors"
# endif
                using namespace std::literals;
                static std::string const accName =
# ifdef __cpp_lib_format
                    std::format("AccGpu{}Rt<{},{}>", TApi::name, TDim::value, core::demangled<TIdx>);
# else
                    "AccGpu"s + TApi::name + "Rt<"s + std::to_string(TDim::value) + ","s
                    + std::string(core::demangled<TIdx>) + ">"s;
# endif
                return accName;
# if ALPAKA_COMP_CLANG
# pragma clang diagnostic pop
# endif
            }
        };

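        // Example (illustrative, not part of this header): for TApi = ApiCudaRt, TDim = DimInt<1u>
        // and TIdx = unsigned long, alpaka::getAccName<Acc>() yields the string
        //     "AccGpuCudaRt<1,unsigned long>"
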
        //! The GPU CUDA accelerator device type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DevType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = DevUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA accelerator dimension getter trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct DimType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TDim;
        };
    } // namespace trait

    namespace detail
    {
        //! Specialization of the TKernelFnObj return type evaluation.
        //
        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
        // The execution task TaskKernelGpuUniformCudaHipRt therefore performs this check on the device side.
        template<typename TApi, typename TDim, typename TIdx>
        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            template<typename TKernelFnObj, typename... TArgs>
            void operator()(TKernelFnObj const&, TArgs const&...)
            {
            }
        };
    } // namespace detail
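
    // Example (illustrative, not part of this header): a kernel function object declared as
    //     struct BadKernel { template<typename TAcc> ALPAKA_FN_ACC int operator()(TAcc const&) const; };
    // is consequently rejected when the task is compiled for the device, not when it is created on the host.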

    namespace trait
    {
        //! The GPU CUDA accelerator execution task type trait specialization.
        template<
            typename TApi,
            typename TDim,
            typename TIdx,
            typename TWorkDiv,
            typename TKernelFnObj,
            typename... TArgs>
        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
        {
            ALPAKA_FN_HOST static auto createTaskKernel(
                TWorkDiv const& workDiv,
                TKernelFnObj const& kernelFnObj,
                TArgs&&... args)
            {
                return TaskKernelGpuUniformCudaHipRt<
                    TApi,
                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
                    TDim,
                    TIdx,
                    TKernelFnObj,
                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
            }
        };
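
        // Example (illustrative, not part of this header): this trait is normally reached through
        //     auto task = alpaka::createTaskKernel<Acc>(workDiv, kernelFnObj, args...);
        //     alpaka::enqueue(queue, task);
        // or in one step via alpaka::exec<Acc>(queue, workDiv, kernelFnObj, args...).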

        //! The GPU CUDA accelerator platform type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct PlatformType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = PlatformUniformCudaHipRt<TApi>;
        };

        //! The GPU CUDA accelerator idx type trait specialization.
        template<typename TApi, typename TDim, typename TIdx>
        struct IdxType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
        {
            using type = TIdx;
        };
    } // namespace trait
} // namespace alpaka

#endif
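
For orientation, the following is a minimal sketch of how this accelerator is driven through the
public alpaka API. It assumes a CUDA-enabled build (ALPAKA_ACC_GPU_CUDA_ENABLED) and uses the
convenience alias AccGpuCudaRt; the kernel functor AxpyKernel, the extents and the work division
are illustrative choices, and host<->device memory copies are omitted for brevity.

#include <alpaka/alpaka.hpp>

#include <cstddef>
#include <iostream>

// Kernels are function objects whose operator() must return void
// (cf. the device-side return type check above).
struct AxpyKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float a, float const* x, float* y, std::size_t n) const
    {
        // Grid-global thread index; one element per thread in this sketch.
        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
        if(i < n)
            y[i] = a * x[i] + y[i];
    }
};

auto main() -> int
{
    using Dim = alpaka::DimInt<1u>;
    using Idx = std::size_t;
    using Acc = alpaka::AccGpuCudaRt<Dim, Idx>; // = AccGpuUniformCudaHipRt<ApiCudaRt, Dim, Idx>

    auto const platform = alpaka::Platform<Acc>{};
    auto const dev = alpaka::getDevByIdx(platform, 0);
    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};

    // The traits specialized in this header back these generic calls.
    std::cout << alpaka::getAccName<Acc>() << '\n';
    auto const props = alpaka::getAccDevProps<Acc>(dev);
    std::cout << "max threads per block: " << props.m_blockThreadCountMax << '\n';

    constexpr Idx n = 1024;
    auto const extent = alpaka::Vec<Dim, Idx>{n};
    auto x = alpaka::allocBuf<float, Idx>(dev, extent);
    auto y = alpaka::allocBuf<float, Idx>(dev, extent);

    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
        alpaka::Vec<Dim, Idx>{n / Idx{256}}, // blocks per grid
        alpaka::Vec<Dim, Idx>{Idx{256}}, // threads per block
        alpaka::Vec<Dim, Idx>{Idx{1}}}; // elements per thread

    // Goes through trait::CreateTaskKernel and TaskKernelGpuUniformCudaHipRt internally.
    alpaka::exec<Acc>(queue, workDiv, AxpyKernel{}, 2.0f, alpaka::getPtrNative(x), alpaka::getPtrNative(y), n);
    alpaka::wait(queue);
}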