alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
PlatformUniformCudaHipRt.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato,
2 * Christian Kaever
3 * SPDX-License-Identifier: MPL-2.0
4 */
5
6#pragma once
7
9#include "alpaka/core/Hip.hpp"
12#include "alpaka/dev/Traits.hpp"
13
14#include <iostream>
15#include <sstream>
16#include <stdexcept>
17#include <tuple>
18
19#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
20
21namespace alpaka
22{
23 // Forward declarations.
24 struct ApiCudaRt;
25 struct ApiHipRt;
26
27 //! The CUDA/HIP RT platform.
//!
//! \tparam TApi The runtime API tag type. This header forward-declares ApiCudaRt and ApiHipRt, and the
//!              device-property printer further down distinguishes exactly those two.
//!
//! Apart from the compiler workaround below, the struct declares no data members.
28 template<typename TApi>
29 struct PlatformUniformCudaHipRt : interface::Implements<ConceptPlatform, PlatformUniformCudaHipRt<TApi>>
30 {
31# if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \
32 && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0)
33 // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295
34 // g++-11 complains in *all* places where a platform object is used, that it "may be used uninitialized".
// The dummy member is only compiled for GCC >= 11.0.0 and < 12.0.0 (see the condition above).
// NOTE(review): the previous wording named PlatformCpu, which is not this type; presumably the comment was
// carried over from the CPU platform header and the same warning applies here - confirm.
35 char c = {};
36# endif
37 };
38
39 namespace trait
40 {
41 //! The CUDA/HIP RT platform device type trait specialization.
42 template<typename TApi>
47
48 //! The CUDA/HIP RT platform device count get trait specialization.
// NOTE(review): this listing skips original lines 50, 52 and 54, i.e. the specialization header, the function
// signature and the first statement of the body. The reference index at the end of this page gives the
// signature as:
//   static ALPAKA_FN_HOST auto getDevCount(PlatformUniformCudaHipRt<TApi> const&) -> std::size_t
// Restore the missing lines from the real header before compiling.
49 template<typename TApi>
51 {
//! \return The device count reported by TApi::getDeviceCount, or 0 if that call does not return
//!         TApi::success.
53 {
55
// Start from zero so that a failed query cannot leave an unspecified count behind.
56 int iNumDevices(0);
57 typename TApi::Error_t error = TApi::getDeviceCount(&iNumDevices);
// A failing count query is deliberately reported as "no devices" instead of being raised as an error.
58 if(error != TApi::success)
59 iNumDevices = 0;
60
61 return static_cast<std::size_t>(iNumDevices);
62 }
63 };
64
65 //! The CUDA/HIP RT platform device get trait specialization.
// NOTE(review): this listing skips original lines 67, 69 and 73, i.e. the specialization header, the first line
// of the function signature and the first statement of the body. The reference index at the end of this page
// gives the signature as:
//   static ALPAKA_FN_HOST auto getDevByIdx(PlatformUniformCudaHipRt<TApi> const& platform,
//                                          std::size_t const& devIdx) -> DevUniformCudaHipRt<TApi>
// Restore the missing lines from the real header before compiling.
66 template<typename TApi>
68 {
//! Creates the device handle for the device with the given index.
//!
//! \param platform The platform instance; it is only used to query the device count.
//! \param devIdx   Zero-based device index; must be smaller than getDevCount(platform).
//! \return A DevUniformCudaHipRt<TApi> constructed from devIdx.
//! \throws std::runtime_error if devIdx is not smaller than the device count, or if isDevUsable(devIdx)
//!         returns false.
70 PlatformUniformCudaHipRt<TApi> const& platform,
71 std::size_t const& devIdx) -> DevUniformCudaHipRt<TApi>
72 {
74
// Reject out-of-range indices first, with a message that states how many devices exist.
75 std::size_t const devCount = getDevCount(platform);
76 if(devIdx >= devCount)
77 {
78 std::stringstream ssErr;
79 ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount
80 << " devices!";
81 throw std::runtime_error(ssErr.str());
82 }
83
// The index is valid; additionally probe whether the device can actually be used (see isDevUsable).
84 if(isDevUsable(devIdx))
85 {
86 DevUniformCudaHipRt<TApi> dev(static_cast<int>(devIdx));
87
88 // Log this device.
// The properties are only queried when some debug output is enabled; at ALPAKA_DEBUG_FULL every property is
// printed, at ALPAKA_DEBUG_MINIMAL only the function name and the device name.
89# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
90 typename TApi::DeviceProp_t devProp;
91 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
92# endif
93# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
94 printDeviceProperties(devProp);
95# elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
// NOTE(review): __func__ and the device name are streamed back to back without any separator, so the two
// run together in the output - confirm whether a separator was intended.
96 std::cout << __func__ << devProp.name << std::endl;
97# endif
98 return dev;
99 }
100 else
101 {
102 std::stringstream ssErr;
103 ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
104 throw std::runtime_error(ssErr.str());
105 }
106 }
107
108 private:
//! Probes a device by selecting it and creating a throw-away stream on it.
//!
//! Calls TApi::setDevice for iDevice; nothing in this function restores a previously selected device.
//!
//! \param iDevice Index of the device to probe; it is cast to int for the runtime API.
109 //! \return If the device is usable.
110 ALPAKA_FN_HOST static auto isDevUsable(std::size_t iDevice) -> bool
111 {
112 typename TApi::Error_t rc = TApi::setDevice(static_cast<int>(iDevice));
113 typename TApi::Stream_t queue = {};
114 // Create a dummy queue to check if the device is already used by another process.
115 // cuda/hip-SetDevice never returns an error if another process already uses the selected device and
116 // gpu compute mode is set "process exclusive". \TODO: Check if this workaround is needed!
// The stream is only created if selecting the device succeeded; otherwise rc keeps the setDevice error.
117 if(rc == TApi::success)
118 {
119 rc = TApi::streamCreate(&queue);
120 }
121
122 if(rc == TApi::success)
123 {
124 // Destroy the dummy queue.
125 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamDestroy(queue));
126 return true;
127 }
128 else
129 {
130 // Return the previous error from cudaStreamCreate.
// NOTE(review): this listing skips original line 131, so the statement the comment above refers to is not
// visible here - restore it from the real header.
132 // Reset the Error state.
// The value returned by getLastError is discarded on purpose; the call is made only for its effect on the
// runtime's error state, as the comment above says.
133 std::ignore = TApi::getLastError();
134 return false;
135 }
136 }
137
138# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
139 //! Prints all the device properties to std::cout.
//!
//! Only compiled when ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL. Properties read in both branches are printed first,
//! followed by the properties that are only read for ApiCudaRt or, otherwise, for ApiHipRt. Every line is
//! terminated with std::endl.
//!
//! \param devProp The device property structure to print.
140 ALPAKA_FN_HOST static auto printDeviceProperties(typename TApi::DeviceProp_t const& devProp) -> void
141 {
// NOTE(review): this listing skips original line 142 (the first statement of the body) - restore it from the
// real header.
143
// Divisors used to print selected byte counts in KiB or MiB.
144 constexpr auto KiB = std::size_t{1024};
145 constexpr auto MiB = KiB * KiB;
146 std::cout << "name: " << devProp.name << std::endl;
147 std::cout << "totalGlobalMem: " << devProp.totalGlobalMem / MiB << " MiB" << std::endl;
148 std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock / KiB << " KiB" << std::endl;
149 std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
150 std::cout << "warpSize: " << devProp.warpSize << std::endl;
151 std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
152 std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1]
153 << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
154 std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", "
155 << devProp.maxGridSize[2] << ")" << std::endl;
156 std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
157 std::cout << "totalConstMem: " << devProp.totalConstMem / KiB << " KiB" << std::endl;
158 std::cout << "major: " << devProp.major << std::endl;
159 std::cout << "minor: " << devProp.minor << std::endl;
160
161 // std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl; // Deprecated
162 std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
163 std::cout << "integrated: " << devProp.integrated << std::endl;
164 std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
165 std::cout << "computeMode: " << devProp.computeMode << std::endl;
166 std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
167 std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
168 std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
169 std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
170 std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
171 std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
172 std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
173 std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
174 std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
// The remaining members are read only for one of the two APIs. if constexpr ensures that the branch not taken
// is not instantiated for the current TApi.
175 if constexpr(std::is_same_v<TApi, ApiCudaRt>)
176 {
177 std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
178 std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
179 std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
180 std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
181 std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
182 std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
183 std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio
184 << std::endl;
185 std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
186 std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
187 std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
188 std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem
189 << std::endl;
190 std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
191 std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
192 std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
193 std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
194 std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1]
195 << std::endl;
196 std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x"
197 << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
198 std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x"
199 << devProp.maxTexture2DGather[1] << std::endl;
200 std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1]
201 << "x" << devProp.maxTexture3D[2] << std::endl;
202 std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
203 std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x"
204 << devProp.maxTexture1DLayered[1] << std::endl;
205 std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x"
206 << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
207 std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x"
208 << devProp.maxTextureCubemapLayered[1] << std::endl;
209 std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
210 std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1]
211 << std::endl;
212 std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1]
213 << "x" << devProp.maxSurface3D[2] << std::endl;
214 std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x"
215 << devProp.maxSurface1DLayered[1] << std::endl;
216 std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x"
217 << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
218 std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
219 std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x"
220 << devProp.maxSurfaceCubemapLayered[1] << std::endl;
221 std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
222 std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
223 std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
224 std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
225 std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
226 std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
227 std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
228 std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
229 std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
230 std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
231 }
232 else
233 { // ApiHipRt
// NOTE(review): the unit suffix below is "kHz" without the leading space that the other unit suffixes in
// this function carry (" kHz", " KiB", " MiB"), so value and unit print joined - confirm whether intended.
234 std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << "kHz" << std::endl;
235 std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor / KiB
236 << " KiB" << std::endl;
237 std::cout << "gcnArchName: " << devProp.gcnArchName << std::endl;
// The members of devProp.arch are printed as an indented group under the "arch:" heading.
238 std::cout << "arch: " << std::endl;
239 std::cout << " hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
240 std::cout << " hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch
241 << std::endl;
242 std::cout << " hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
243 std::cout << " hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch
244 << std::endl;
245 std::cout << " hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
246 std::cout << " hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
247 std::cout << " hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
248 std::cout << " hasDoubles: " << devProp.arch.hasDoubles << std::endl;
249 std::cout << " hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
250 std::cout << " hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
251 std::cout << " hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
252 std::cout << " hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
253 std::cout << " hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
254 std::cout << " hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
255 std::cout << " hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
256 std::cout << " has3dGrid: " << devProp.arch.has3dGrid << std::endl;
257 std::cout << " hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
258 }
259 }
260# endif
261 };
262 } // namespace trait
263} // namespace alpaka
264
265#endif
#define ALPAKA_DEBUG_FULL_LOG_SCOPE
Definition Debug.hpp:62
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
auto getNativeHandle() const noexcept -> int
#define ALPAKA_FN_HOST
Definition Common.hpp:40
The alpaka accelerator library.
ALPAKA_FN_HOST auto getDevCount(TPlatform const &platform)
Definition Traits.hpp:55
Tag used in class inheritance hierarchies that describes that a specific interface (TInterface) is im...
Definition Interface.hpp:15
The device type trait.
Definition Traits.hpp:23
static ALPAKA_FN_HOST auto getDevByIdx(PlatformUniformCudaHipRt< TApi > const &platform, std::size_t const &devIdx) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition DevCpu.hpp:41
static ALPAKA_FN_HOST auto getDevCount(PlatformUniformCudaHipRt< TApi > const &) -> std::size_t
The device count get trait.
Definition Traits.hpp:42