alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
ApiCudaRt.hpp
Go to the documentation of this file.
1/* Copyright 2022 Andrea Bocci
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include <boost/predef.h>
8
9#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
10# include <cuda_runtime_api.h>
11
12namespace alpaka
13{
14 struct ApiCudaRt
15 {
16 // Names
17 static constexpr char name[] = "Cuda";
18 static constexpr auto version = BOOST_PREDEF_MAKE_10_VVRRP(CUDART_VERSION);
19
20 // Types
21 using DeviceAttr_t = ::cudaDeviceAttr;
22 using DeviceProp_t = ::cudaDeviceProp;
23 using Error_t = ::cudaError_t;
24 using Event_t = ::cudaEvent_t;
25 using Extent_t = ::cudaExtent;
26 using Flag_t = unsigned int;
27 using FuncAttributes_t = ::cudaFuncAttributes;
28 using HostFn_t = void (*)(void* data); // same as cudaHostFn_t, without the CUDART_CB calling convention
29 using Limit_t = ::cudaLimit;
30 using Memcpy3DParms_t = ::cudaMemcpy3DParms;
31 using MemcpyKind_t = ::cudaMemcpyKind;
32 using PitchedPtr_t = ::cudaPitchedPtr;
33 using Pos_t = ::cudaPos;
34 using Stream_t = ::cudaStream_t;
35
36 // Constants
37 static constexpr Error_t success = ::cudaSuccess;
38 static constexpr Error_t errorNotReady = ::cudaErrorNotReady;
39 static constexpr Error_t errorHostMemoryAlreadyRegistered = ::cudaErrorHostMemoryAlreadyRegistered;
40 static constexpr Error_t errorHostMemoryNotRegistered = ::cudaErrorHostMemoryNotRegistered;
41 static constexpr Error_t errorUnsupportedLimit = ::cudaErrorUnsupportedLimit;
42 static constexpr Error_t errorUnknown = ::cudaErrorUnknown;
43
44 static constexpr Flag_t eventDefault = cudaEventDefault;
45 static constexpr Flag_t eventBlockingSync = cudaEventBlockingSync;
46 static constexpr Flag_t eventDisableTiming = cudaEventDisableTiming;
47 static constexpr Flag_t eventInterprocess = cudaEventInterprocess;
48
49 static constexpr Flag_t hostMallocDefault = cudaHostAllocDefault;
50 static constexpr Flag_t hostMallocMapped = cudaHostAllocMapped;
51 static constexpr Flag_t hostMallocPortable = cudaHostAllocPortable;
52 static constexpr Flag_t hostMallocWriteCombined = cudaHostAllocWriteCombined;
53 static constexpr Flag_t hostMallocCoherent = cudaHostAllocDefault; // Not supported.
54 static constexpr Flag_t hostMallocNonCoherent = cudaHostAllocDefault; // Not supported.
55
56 static constexpr Flag_t hostRegisterDefault = cudaHostRegisterDefault;
57 static constexpr Flag_t hostRegisterPortable = cudaHostRegisterPortable;
58 static constexpr Flag_t hostRegisterMapped = cudaHostRegisterMapped;
59 static constexpr Flag_t hostRegisterIoMemory = cudaHostRegisterIoMemory;
60
61 static constexpr MemcpyKind_t memcpyDefault = ::cudaMemcpyDefault;
62 static constexpr MemcpyKind_t memcpyDeviceToDevice = ::cudaMemcpyDeviceToDevice;
63 static constexpr MemcpyKind_t memcpyDeviceToHost = ::cudaMemcpyDeviceToHost;
64 static constexpr MemcpyKind_t memcpyHostToDevice = ::cudaMemcpyHostToDevice;
65
66 static constexpr Flag_t streamDefault = cudaStreamDefault;
67 static constexpr Flag_t streamNonBlocking = cudaStreamNonBlocking;
68
69 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::cudaDevAttrMaxBlockDimX;
70 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::cudaDevAttrMaxBlockDimY;
71 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::cudaDevAttrMaxBlockDimZ;
72 static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::cudaDevAttrMaxGridDimX;
73 static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::cudaDevAttrMaxGridDimY;
74 static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::cudaDevAttrMaxGridDimZ;
75 static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
76 static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
77 static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
78 static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
79
80 static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
81 static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
82
83 // Host function helper
84 // Encapsulates the different function signatures used by cudaStreamAddCallback and cudaLaunchHostFn, and the
85 // different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
87 {
89 void* data_;
90
91 static void CUDART_CB hostFunction(void* data)
92 {
93 auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
94 ptr->func_(ptr->data_);
95 delete ptr;
96 }
97
98 static void CUDART_CB streamCallback(Stream_t, Error_t, void* data)
99 {
100 auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
101 ptr->func_(ptr->data_);
102 delete ptr;
103 }
104 };
105
106 // Runtime API
107 static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
108 {
109 return ::cudaDeviceGetAttribute(value, attr, device);
110 }
111
112 static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
113 {
114 return ::cudaDeviceGetLimit(pValue, limit);
115 }
116
117 static inline Error_t deviceReset()
118 {
119 return ::cudaDeviceReset();
120 }
121
122 static inline Error_t deviceSetLimit(Limit_t limit, size_t value)
123 {
124 return ::cudaDeviceSetLimit(limit, value);
125 }
126
128 {
129 return ::cudaDeviceSynchronize();
130 }
131
132 static inline Error_t eventCreate(Event_t* event)
133 {
134 return ::cudaEventCreate(event);
135 }
136
137 static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
138 {
139 return ::cudaEventCreateWithFlags(event, flags);
140 }
141
142 static inline Error_t eventDestroy(Event_t event)
143 {
144 return ::cudaEventDestroy(event);
145 }
146
147 static inline Error_t eventQuery(Event_t event)
148 {
149 return ::cudaEventQuery(event);
150 }
151
152 static inline Error_t eventRecord(Event_t event, Stream_t stream)
153 {
154 return ::cudaEventRecord(event, stream);
155 }
156
157 static inline Error_t eventSynchronize(Event_t event)
158 {
159 return ::cudaEventSynchronize(event);
160 }
161
162 static inline Error_t free(void* devPtr)
163 {
164 return ::cudaFree(devPtr);
165 }
166
167 static inline Error_t freeAsync([[maybe_unused]] void* devPtr, [[maybe_unused]] Stream_t stream)
168 {
169# if CUDART_VERSION >= 11020
170 return ::cudaFreeAsync(devPtr, stream);
171# else
172 // Not implemented.
173 return errorUnknown;
174# endif
175 }
176
177 static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
178 {
179 return ::cudaFuncGetAttributes(attr, func);
180 }
181
182 template<typename T>
183 static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
184 {
185# if BOOST_COMP_GNUC
186# pragma GCC diagnostic push
187# pragma GCC diagnostic ignored "-Wconditionally-supported"
188# endif
189 return ::cudaFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
190# if BOOST_COMP_GNUC
191# pragma GCC diagnostic pop
192# endif
193 }
194
195 static inline Error_t getDeviceCount(int* count)
196 {
197 return ::cudaGetDeviceCount(count);
198 }
199
200 static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
201 {
202 return ::cudaGetDeviceProperties(prop, device);
203 }
204
205 static inline char const* getErrorName(Error_t error)
206 {
207 return ::cudaGetErrorName(error);
208 }
209
210 static inline char const* getErrorString(Error_t error)
211 {
212 return ::cudaGetErrorString(error);
213 }
214
215 static inline Error_t getLastError()
216 {
217 return ::cudaGetLastError();
218 }
219
220 static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
221 {
222 return ::cudaGetSymbolAddress(devPtr, symbol);
223 }
224
225 template<class T>
226 static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
227 {
228 return ::cudaGetSymbolAddress(devPtr, symbol);
229 }
230
231 static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
232 {
233 return ::cudaHostGetDevicePointer(pDevice, pHost, flags);
234 }
235
236 static inline Error_t hostFree(void* ptr)
237 {
238 return ::cudaFreeHost(ptr);
239 }
240
241 static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
242 {
243 return ::cudaHostAlloc(ptr, size, flags);
244 }
245
246 static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
247 {
248 return ::cudaHostRegister(ptr, size, flags);
249 }
250
251 static inline Error_t hostUnregister(void* ptr)
252 {
253 return ::cudaHostUnregister(ptr);
254 }
255
256 static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
257 {
258# if CUDART_VERSION >= 10000
259 // Wrap the host function using the proper calling convention
260 return ::cudaLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
261# else
262 // Emulate cudaLaunchHostFunc using cudaStreamAddCallback with a callback adaptor.
263 return ::cudaStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
264# endif
265 }
266
267 static inline Error_t malloc(void** devPtr, size_t size)
268 {
269 return ::cudaMalloc(devPtr, size);
270 }
271
272 static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
273 {
274 return ::cudaMalloc3D(pitchedDevPtr, extent);
275 }
276
277 static inline Error_t mallocAsync(
278 [[maybe_unused]] void** devPtr,
279 [[maybe_unused]] size_t size,
280 [[maybe_unused]] Stream_t stream)
281 {
282# if CUDART_VERSION >= 11020
283 return ::cudaMallocAsync(devPtr, size, stream);
284# else
285 // Not implemented.
286 return errorUnknown;
287# endif
288 }
289
290 static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
291 {
292 return ::cudaMallocPitch(devPtr, pitch, width, height);
293 }
294
295 static inline Error_t memGetInfo(size_t* free, size_t* total)
296 {
297 return ::cudaMemGetInfo(free, total);
298 }
299
300 static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
301 {
302 return ::cudaMemcpy(dst, src, count, kind);
303 }
304
305 static inline Error_t memcpy2DAsync(
306 void* dst,
307 size_t dpitch,
308 void const* src,
309 size_t spitch,
310 size_t width,
311 size_t height,
312 MemcpyKind_t kind,
313 Stream_t stream)
314 {
315 return ::cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
316 }
317
318 static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
319 {
320 return ::cudaMemcpy3DAsync(p, stream);
321 }
322
323 static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
324 {
325 return ::cudaMemcpyAsync(dst, src, count, kind, stream);
326 }
327
328 static inline Error_t memset2DAsync(
329 void* devPtr,
330 size_t pitch,
331 int value,
332 size_t width,
333 size_t height,
334 Stream_t stream)
335 {
336 return ::cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
337 }
338
339 static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
340 {
341 return ::cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
342 }
343
344 static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
345 {
346 return ::cudaMemsetAsync(devPtr, value, count, stream);
347 }
348
349 static inline Error_t setDevice(int device)
350 {
351 return ::cudaSetDevice(device);
352 }
353
354 static inline Error_t streamCreate(Stream_t* pStream)
355 {
356 return ::cudaStreamCreate(pStream);
357 }
358
359 static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
360 {
361 return ::cudaStreamCreateWithFlags(pStream, flags);
362 }
363
364 static inline Error_t streamDestroy(Stream_t stream)
365 {
366 return ::cudaStreamDestroy(stream);
367 }
368
369 static inline Error_t streamQuery(Stream_t stream)
370 {
371 return ::cudaStreamQuery(stream);
372 }
373
374 static inline Error_t streamSynchronize(Stream_t stream)
375 {
376 return ::cudaStreamSynchronize(stream);
377 }
378
379 static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
380 {
381 return ::cudaStreamWaitEvent(stream, event, flags);
382 }
383
384 static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
385 {
386 return ::make_cudaPitchedPtr(d, p, xsz, ysz);
387 }
388
389 static inline Pos_t makePos(size_t x, size_t y, size_t z)
390 {
391 return ::make_cudaPos(x, y, z);
392 }
393
394 static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
395 {
396 return ::make_cudaExtent(w, h, d);
397 }
398 };
399
400} // namespace alpaka
401
402#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
The alpaka accelerator library.
static void CUDART_CB streamCallback(Stream_t, Error_t, void *data)
Definition ApiCudaRt.hpp:98
static void CUDART_CB hostFunction(void *data)
Definition ApiCudaRt.hpp:91
static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX
Definition ApiCudaRt.hpp:69
static Error_t getDeviceCount(int *count)
static Error_t hostMalloc(void **ptr, size_t size, Flag_t flags)
static constexpr Flag_t eventDefault
Definition ApiCudaRt.hpp:44
static Error_t deviceReset()
static constexpr Flag_t eventInterprocess
Definition ApiCudaRt.hpp:47
static Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
unsigned int Flag_t
Definition ApiCudaRt.hpp:26
static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount
Definition ApiCudaRt.hpp:77
static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ
Definition ApiCudaRt.hpp:71
static Error_t funcGetAttributes(FuncAttributes_t *attr, T *func)
::cudaStream_t Stream_t
Definition ApiCudaRt.hpp:34
::cudaMemcpy3DParms Memcpy3DParms_t
Definition ApiCudaRt.hpp:30
static constexpr Flag_t eventDisableTiming
Definition ApiCudaRt.hpp:46
static constexpr Flag_t hostMallocPortable
Definition ApiCudaRt.hpp:51
static Error_t memcpyAsync(void *dst, void const *src, size_t count, MemcpyKind_t kind, Stream_t stream)
static Error_t memset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, Stream_t stream)
static constexpr Flag_t hostMallocWriteCombined
Definition ApiCudaRt.hpp:52
static Error_t freeAsync(void *devPtr, Stream_t stream)
static Error_t streamDestroy(Stream_t stream)
::cudaEvent_t Event_t
Definition ApiCudaRt.hpp:24
static constexpr Flag_t eventBlockingSync
Definition ApiCudaRt.hpp:45
static constexpr char name[]
Definition ApiCudaRt.hpp:17
static Error_t getLastError()
static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock
Definition ApiCudaRt.hpp:75
::cudaPitchedPtr PitchedPtr_t
Definition ApiCudaRt.hpp:32
static constexpr Flag_t hostMallocDefault
Definition ApiCudaRt.hpp:49
static Error_t malloc(void **devPtr, size_t size)
static Error_t hostUnregister(void *ptr)
static Error_t hostFree(void *ptr)
static constexpr Flag_t hostRegisterPortable
Definition ApiCudaRt.hpp:57
static Error_t deviceSetLimit(Limit_t limit, size_t value)
static constexpr Limit_t limitMallocHeapSize
Definition ApiCudaRt.hpp:81
static Error_t memGetInfo(size_t *free, size_t *total)
static constexpr Error_t errorNotReady
Definition ApiCudaRt.hpp:38
static Error_t eventSynchronize(Event_t event)
static PitchedPtr_t makePitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
static constexpr Flag_t hostMallocNonCoherent
Definition ApiCudaRt.hpp:54
static constexpr Limit_t limitPrintfFifoSize
Definition ApiCudaRt.hpp:80
static constexpr auto version
Definition ApiCudaRt.hpp:18
static Error_t streamCreate(Stream_t *pStream)
static Error_t mallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height)
static Error_t funcGetAttributes(FuncAttributes_t *attr, void const *func)
static constexpr Flag_t streamNonBlocking
Definition ApiCudaRt.hpp:67
static Error_t hostGetDevicePointer(void **pDevice, void *pHost, Flag_t flags)
static constexpr Flag_t hostMallocMapped
Definition ApiCudaRt.hpp:50
static Extent_t makeExtent(size_t w, size_t h, size_t d)
static Error_t deviceSynchronize()
::cudaMemcpyKind MemcpyKind_t
Definition ApiCudaRt.hpp:31
static constexpr Error_t success
Definition ApiCudaRt.hpp:37
static Error_t eventQuery(Event_t event)
static Error_t deviceGetAttribute(int *value, DeviceAttr_t attr, int device)
static constexpr Flag_t hostRegisterDefault
Definition ApiCudaRt.hpp:56
static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock
Definition ApiCudaRt.hpp:76
::cudaError_t Error_t
Definition ApiCudaRt.hpp:23
static constexpr Error_t errorUnknown
Definition ApiCudaRt.hpp:42
static Error_t getSymbolAddress(void **devPtr, T const &symbol)
static Error_t memcpy2DAsync(void *dst, size_t dpitch, void const *src, size_t spitch, size_t width, size_t height, MemcpyKind_t kind, Stream_t stream)
::cudaFuncAttributes FuncAttributes_t
Definition ApiCudaRt.hpp:27
static Error_t free(void *devPtr)
static constexpr MemcpyKind_t memcpyDeviceToDevice
Definition ApiCudaRt.hpp:62
static char const * getErrorString(Error_t error)
static Error_t streamSynchronize(Stream_t stream)
static Pos_t makePos(size_t x, size_t y, size_t z)
static constexpr MemcpyKind_t memcpyHostToDevice
Definition ApiCudaRt.hpp:64
static constexpr Flag_t hostRegisterMapped
Definition ApiCudaRt.hpp:58
void(*)(void *data) HostFn_t
Definition ApiCudaRt.hpp:28
static Error_t mallocAsync(void **devPtr, size_t size, Stream_t stream)
static char const * getErrorName(Error_t error)
static Error_t streamCreateWithFlags(Stream_t *pStream, Flag_t flags)
static Error_t streamQuery(Stream_t stream)
static constexpr Error_t errorHostMemoryAlreadyRegistered
Definition ApiCudaRt.hpp:39
static Error_t getDeviceProperties(DeviceProp_t *prop, int device)
static constexpr Flag_t hostMallocCoherent
Definition ApiCudaRt.hpp:53
static Error_t getSymbolAddress(void **devPtr, void const *symbol)
static Error_t memcpy3DAsync(Memcpy3DParms_t const *p, Stream_t stream)
static Error_t eventRecord(Event_t event, Stream_t stream)
static Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void *userData)
::cudaLimit Limit_t
Definition ApiCudaRt.hpp:29
static Error_t eventCreate(Event_t *event)
static constexpr Flag_t hostRegisterIoMemory
Definition ApiCudaRt.hpp:59
::cudaExtent Extent_t
Definition ApiCudaRt.hpp:25
static constexpr DeviceAttr_t deviceAttributeMaxGridDimX
Definition ApiCudaRt.hpp:72
static constexpr Flag_t streamDefault
Definition ApiCudaRt.hpp:66
static Error_t hostRegister(void *ptr, size_t size, Flag_t flags)
static Error_t setDevice(int device)
static constexpr MemcpyKind_t memcpyDeviceToHost
Definition ApiCudaRt.hpp:63
::cudaDeviceAttr DeviceAttr_t
Definition ApiCudaRt.hpp:21
static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY
Definition ApiCudaRt.hpp:70
static Error_t malloc3D(PitchedPtr_t *pitchedDevPtr, Extent_t extent)
static Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
::cudaDeviceProp DeviceProp_t
Definition ApiCudaRt.hpp:22
static Error_t memcpy(void *dst, void const *src, size_t count, MemcpyKind_t kind)
static constexpr MemcpyKind_t memcpyDefault
Definition ApiCudaRt.hpp:61
static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ
Definition ApiCudaRt.hpp:74
static Error_t deviceGetLimit(size_t *pValue, Limit_t limit)
static Error_t eventCreateWithFlags(Event_t *event, Flag_t flags)
static constexpr Error_t errorHostMemoryNotRegistered
Definition ApiCudaRt.hpp:40
static constexpr DeviceAttr_t deviceAttributeWarpSize
Definition ApiCudaRt.hpp:78
static Error_t memsetAsync(void *devPtr, int value, size_t count, Stream_t stream)
static constexpr DeviceAttr_t deviceAttributeMaxGridDimY
Definition ApiCudaRt.hpp:73
static Error_t eventDestroy(Event_t event)
static constexpr Error_t errorUnsupportedLimit
Definition ApiCudaRt.hpp:41