alpaka
Abstraction Library for Parallel Kernel Acceleration
PlatformUniformCudaHipRt.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato,
2  * Christian Kaever
3  * SPDX-License-Identifier: MPL-2.0
4  */
5 
6 #pragma once
7 
9 #include "alpaka/core/Cuda.hpp"
10 #include "alpaka/core/Hip.hpp"
12 #include "alpaka/dev/Traits.hpp"
13 
14 #include <iostream>
15 #include <sstream>
16 #include <stdexcept>
17 #include <tuple>
18 
19 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
20 
21 namespace alpaka
22 {
23  // Forward declarations.
24  struct ApiCudaRt;
25  struct ApiHipRt;
26 
27  //! The CUDA/HIP RT platform, templated on the runtime API tag type TApi (e.g. ApiCudaRt or ApiHipRt).
28  template<typename TApi>
29  struct PlatformUniformCudaHipRt : concepts::Implements<ConceptPlatform, PlatformUniformCudaHipRt<TApi>>
30  {
31 # if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \
32  && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0)
33  // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295
34  // g++-11 complains in *all* places where a platform object (the note was originally written for PlatformCpu) is used, that it "may be used uninitialized". The dummy member below is compiled in only for GCC >= 11 and < 12.
35  char c = {};
36 # endif
37  };
38 
39  namespace trait
40  {
41  //! The CUDA/HIP RT platform device type trait specialization.
42  template<typename TApi>
44  {
46  };
47 
48  //! The CUDA/HIP RT platform device count get trait specialization.
49  template<typename TApi>
51  {
52  ALPAKA_FN_HOST static auto getDevCount(PlatformUniformCudaHipRt<TApi> const&) -> std::size_t
53  {
55 
56  int iNumDevices(0);
57  typename TApi::Error_t error = TApi::getDeviceCount(&iNumDevices);
58  if(error != TApi::success)
59  iNumDevices = 0;
60 
61  return static_cast<std::size_t>(iNumDevices);
62  }
63  };
64 
65  //! The CUDA/HIP RT platform device get trait specialization.
66  template<typename TApi>
68  {
70  PlatformUniformCudaHipRt<TApi> const& platform,
71  std::size_t const& devIdx) -> DevUniformCudaHipRt<TApi>
72  {
74 
75  std::size_t const devCount = getDevCount(platform);
76  if(devIdx >= devCount)
77  {
78  std::stringstream ssErr;
79  ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount
80  << " devices!";
81  throw std::runtime_error(ssErr.str());
82  }
83 
84  if(isDevUsable(devIdx))
85  {
86  DevUniformCudaHipRt<TApi> dev(static_cast<int>(devIdx));
87 
88  // Log this device.
89 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
90  typename TApi::DeviceProp_t devProp;
91  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
92 # endif
93 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
94  printDeviceProperties(devProp);
95 # elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
96  std::cout << __func__ << devProp.name << std::endl;
97 # endif
98  return dev;
99  }
100  else
101  {
102  std::stringstream ssErr;
103  ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
104  throw std::runtime_error(ssErr.str());
105  }
106  }
107 
108  private:
109  //! \return If the device is usable.
110  ALPAKA_FN_HOST static auto isDevUsable(std::size_t iDevice) -> bool
111  {
112  typename TApi::Error_t rc = TApi::setDevice(static_cast<int>(iDevice));
113  typename TApi::Stream_t queue = {};
114  // Create a dummy queue to check whether the device is already in use by another process.
115  // cuda/hip-SetDevice never returns an error if another process already uses the selected device and
116  // the GPU compute mode is set to "process exclusive". \TODO: Check if this workaround is needed!
117  if(rc == TApi::success)
118  {
119  rc = TApi::streamCreate(&queue);
120  }
121 
122  if(rc == TApi::success)
123  {
124  // Destroy the dummy queue.
125  ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamDestroy(queue));
126  return true;
127  }
128  else
129  {
130  // Return the previous error from cudaStreamCreate.
132  // Reset the Error state.
133  std::ignore = TApi::getLastError();
134  return false;
135  }
136  }
137 
138 # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
139  //! Prints all the device properties to std::cout.
140  ALPAKA_FN_HOST static auto printDeviceProperties(typename TApi::DeviceProp_t const& devProp) -> void
141  {
143 
144  constexpr auto KiB = std::size_t{1024};
145  constexpr auto MiB = KiB * KiB;
146  std::cout << "name: " << devProp.name << std::endl;
147  std::cout << "totalGlobalMem: " << devProp.totalGlobalMem / MiB << " MiB" << std::endl;
148  std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock / KiB << " KiB" << std::endl;
149  std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
150  std::cout << "warpSize: " << devProp.warpSize << std::endl;
151  std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
152  std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1]
153  << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
154  std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", "
155  << devProp.maxGridSize[2] << ")" << std::endl;
156  std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
157  std::cout << "totalConstMem: " << devProp.totalConstMem / KiB << " KiB" << std::endl;
158  std::cout << "major: " << devProp.major << std::endl;
159  std::cout << "minor: " << devProp.minor << std::endl;
160 
161  // std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl; // Deprecated
162  std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
163  std::cout << "integrated: " << devProp.integrated << std::endl;
164  std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
165  std::cout << "computeMode: " << devProp.computeMode << std::endl;
166  std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
167  std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
168  std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
169  std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
170  std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
171  std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
172  std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
173  std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
174  std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
175  if constexpr(std::is_same_v<TApi, ApiCudaRt>)
176  {
177  std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
178  std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
179  std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
180  std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
181  std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
182  std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
183  std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio
184  << std::endl;
185  std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
186  std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
187  std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
188  std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem
189  << std::endl;
190  std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
191  std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
192  std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
193  std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
194  std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1]
195  << std::endl;
196  std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x"
197  << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
198  std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x"
199  << devProp.maxTexture2DGather[1] << std::endl;
200  std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1]
201  << "x" << devProp.maxTexture3D[2] << std::endl;
202  std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
203  std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x"
204  << devProp.maxTexture1DLayered[1] << std::endl;
205  std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x"
206  << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
207  std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x"
208  << devProp.maxTextureCubemapLayered[1] << std::endl;
209  std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
210  std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1]
211  << std::endl;
212  std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1]
213  << "x" << devProp.maxSurface3D[2] << std::endl;
214  std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x"
215  << devProp.maxSurface1DLayered[1] << std::endl;
216  std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x"
217  << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
218  std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
219  std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x"
220  << devProp.maxSurfaceCubemapLayered[1] << std::endl;
221  std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
222  std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
223  std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
224  std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
225  std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
226  std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
227  std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
228  std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
229  std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
230  std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
231  }
232  else
233  { // ApiHipRt
234  std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << "kHz" << std::endl;
235  std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor / KiB
236  << " KiB" << std::endl;
237  std::cout << "gcnArch: " << devProp.gcnArch << std::endl;
238  std::cout << "arch: " << std::endl;
239  std::cout << " hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
240  std::cout << " hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch
241  << std::endl;
242  std::cout << " hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
243  std::cout << " hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch
244  << std::endl;
245  std::cout << " hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
246  std::cout << " hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
247  std::cout << " hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
248  std::cout << " hasDoubles: " << devProp.arch.hasDoubles << std::endl;
249  std::cout << " hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
250  std::cout << " hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
251  std::cout << " hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
252  std::cout << " hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
253  std::cout << " hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
254  std::cout << " hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
255  std::cout << " hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
256  std::cout << " has3dGrid: " << devProp.arch.has3dGrid << std::endl;
257  std::cout << " hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
258  }
259  }
260 # endif
261  };
262  } // namespace trait
263 } // namespace alpaka
264 
265 #endif
#define ALPAKA_DEBUG_FULL_LOG_SCOPE
Definition: Debug.hpp:62
#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)
CUDA/HIP runtime error checking with log and exception.
The CUDA/HIP RT device handle.
auto getNativeHandle() const noexcept -> int
#define ALPAKA_FN_HOST
Definition: Common.hpp:40
The alpaka accelerator library.
ALPAKA_FN_HOST auto getDevCount(TPlatform const &platform)
Definition: Traits.hpp:55
Tag used in class inheritance hierarchies that describes that a specific concept (TConcept) is implem...
Definition: Concepts.hpp:15
The device type trait.
Definition: Traits.hpp:23
static ALPAKA_FN_HOST auto getDevByIdx(PlatformUniformCudaHipRt< TApi > const &platform, std::size_t const &devIdx) -> DevUniformCudaHipRt< TApi >
The device get trait.
Definition: DevCpu.hpp:41
static ALPAKA_FN_HOST auto getDevCount(PlatformUniformCudaHipRt< TApi > const &) -> std::size_t
The device count get trait.
Definition: Traits.hpp:42