alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Vectorize.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
8
9#include <cstddef>
10#include <cstdint>
11
12//! Suggests vectorization of the directly following loop to the compiler.
13//!
14//! Usage:
15//! `ALPAKA_VECTORIZE_HINT
16//! for(...){...}`
17// \TODO: Implement for other compilers.
18// See: http://stackoverflow.com/questions/2706286/pragmas-swp-ivdep-prefetch-support-in-various-compilers
19/*
20#if ALPAKA_COMP_PGI
21 #define ALPAKA_VECTORIZE_HINT(...) _Pragma("vector")
22#elif ALPAKA_COMP_MSVC
23 #define ALPAKA_VECTORIZE_HINT(...) __pragma(loop(ivdep))
24#elif ALPAKA_COMP_GNUC
25 #define ALPAKA_VECTORIZE_HINT(...) _Pragma("GCC ivdep")
26#else
27 #define ALPAKA_VECTORIZE_HINT(...)
28#endif*/
29
31{
32 // The alignment required to enable optimal performance, dependent on the target architecture.
33 constexpr std::size_t defaultAlignment =
34#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__MIC__)
35 64u
36#elif defined(__AVX__) || defined(__AVX2__)
37 32u
38#else
39 16u
40#endif
41 ;
42
43 // Number of elements of the given type that can be processed in parallel in a vector register.
44 // By default there is no vectorization.
45 template<typename TElem>
47 {
48 static constexpr std::size_t value = 1u;
49 };
50
51 // Number of elements of the given type that can be processed in parallel in a vector register.
52 template<>
54 {
55 static constexpr std::size_t value =
56#if defined(__AVX512F__) || defined(__MIC__)
57 // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
58 // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
59 // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
60 8u;
61#elif defined(__AVX__)
62 // addition (AVX): vaddpd / _mm256_add_pd
63 // subtraction (AVX): vsubpd / _mm256_sub_pd
64 // multiplication (AVX): vmulpd / _mm256_mul_pd
65 4u;
66#elif defined(__SSE2__)
67 // addition (SSE2): addpd / _mm_add_pd
68 // subtraction (SSE2): subpd / _mm_sub_pd
69 // multiplication (SSE2): mulpd / _mm_mul_pd
70 2u;
71#elif defined(__ARM_NEON__)
72 // No support for double precision vectorization!
73 1u;
74#elif defined(__ALTIVEC__)
75 2u;
76#else
77 1u;
78#endif
79 };
80
81 // Number of elements of the given type that can be processed in parallel in a vector register.
82 template<>
84 {
85 static constexpr std::size_t value =
86#if defined(__AVX512F__) || defined(__MIC__)
87 // addition (AVX512F,KNC): vaddps / _mm512_add_ps
88 // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
89 // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
90 16u;
91#elif defined(__AVX__)
92 // addition (AVX): vaddps / _mm256_add_ps
93 // subtraction (AVX): vsubps / _mm256_sub_ps
94 // multiplication (AVX): vmulps / _mm256_mul_ps
95 8u;
96#elif defined(__SSE__)
97 // addition (SSE): addps / _mm_add_ps
98 // subtraction (SSE): subps / _mm_sub_ps
99 // multiplication (SSE): mulps / _mm_mul_ps
100 4u;
101#elif defined(__ARM_NEON__)
102 4u;
103#elif defined(__ALTIVEC__)
104 4u;
105#else
106 1u;
107#endif
108 };
109
110 // Number of elements of the given type that can be processed in parallel in a vector register.
111 template<>
113 {
114 static constexpr std::size_t value =
115#if defined(__AVX512BW__)
116 // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
117 // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
118 // multiplication: -
119 64u;
120#elif defined(__AVX2__)
121 // addition (AVX2): vpaddb / _mm256_add_epi8
122 // subtraction (AVX2): vpsubb / _mm256_sub_epi8
123 // multiplication: -
124 32u;
125#elif defined(__SSE2__)
126 // addition (SSE2): paddb / _mm_add_epi8
127 // subtraction (SSE2): psubb / _mm_sub_epi8
128 // multiplication: -
129 16u;
130#elif defined(__ARM_NEON__)
131 16u;
132#elif defined(__ALTIVEC__)
133 16u;
134#elif defined(__CUDA_ARCH__)
135 // addition: __vadd4
136 // subtraction: __vsub4
137 // multiplication: -
138 4u;
139#else
140 1u;
141#endif
142 };
143
144 // Number of elements of the given type that can be processed in parallel in a vector register.
145 template<>
147 {
148 static constexpr std::size_t value =
149#if defined(__AVX512BW__)
150 // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
151 // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
152 // multiplication: -
153 64u;
154#elif defined(__AVX2__)
155 // addition (AVX2): vpaddb / _mm256_add_epi8
156 // subtraction (AVX2): vpsubb / _mm256_sub_epi8
157 // multiplication: -
158 32u;
159#elif defined(__SSE2__)
160 // addition (SSE2): paddb / _mm_add_epi8
161 // subtraction (SSE2): psubb / _mm_sub_epi8
162 // multiplication: -
163 16u;
164#elif defined(__ARM_NEON__)
165 16u;
166#elif defined(__ALTIVEC__)
167 16u;
168#elif defined(__CUDA_ARCH__)
169 // addition: __vadd4
170 // subtraction: __vsub4
171 // multiplication: -
172 4u;
173#else
174 1u;
175#endif
176 };
177
178 // Number of elements of the given type that can be processed in parallel in a vector register.
179 template<>
181 {
182 static constexpr std::size_t value =
183#if defined(__AVX512BW__)
184 // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
185 // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
186 // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
187 32u;
188#elif defined(__AVX2__)
189 // addition (AVX2): vpaddw / _mm256_add_epi16
190 // subtraction (AVX2): vpsubw / _mm256_sub_epi16
191 // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
192 16u;
193#elif defined(__SSE2__)
194 // addition (SSE2): paddw / _mm_add_epi16
195 // subtraction (SSE2): psubw / _mm_sub_epi16
196 // multiplication (SSE2): pmullw / _mm_mullo_epi16
197 8u;
198#elif defined(__ARM_NEON__)
199 8u;
200#elif defined(__ALTIVEC__)
201 8u;
202#elif defined(__CUDA_ARCH__)
203 // addition: __vadd2
204 // subtraction: __vsub2
205 // multiplication: -
206 2u;
207#else
208 1u;
209#endif
210 };
211
212 // Number of elements of the given type that can be processed in parallel in a vector register.
213 template<>
215 {
216 static constexpr std::size_t value =
217#if defined(__AVX512BW__)
218 // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
219 // subtraction (AVX512BW): vpsubusw / _mm512_subs_epu16
220 // multiplication: ?
221 32u;
222#elif defined(__AVX2__)
223 // addition (AVX2): vpaddusw / _mm256_adds_epu16
224 // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
225 // multiplication: ?
226 16u;
227#elif defined(__SSE2__)
228 // addition (SSE2): paddusw / _mm_adds_epu16
229 // subtraction (SSE2): psubusw / _mm_subs_epu16
230 // multiplication: ?
231 8u;
232#elif defined(__ARM_NEON__)
233 8u;
234#elif defined(__ALTIVEC__)
235 8u;
236#elif defined(__CUDA_ARCH__)
237 // addition: __vadd2
238 // subtraction: __vsub2
239 // multiplication: -
240 2u;
241#else
242 1u;
243#endif
244 };
245
246 // Number of elements of the given type that can be processed in parallel in a vector register.
247 template<>
249 {
250 static constexpr std::size_t value =
251#if defined(__AVX512F__) || defined(__MIC__)
252 // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
253 // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
254 // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
255 16u;
256#elif defined(__AVX2__)
257 // addition (AVX2): vpaddd / _mm256_add_epi32
258 // subtraction (AVX2): vpsubd / _mm256_sub_epi32
259 // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
260 8u;
261#elif defined(__SSE2__)
262 // addition (SSE2): paddd / _mm_add_epi32
263 // subtraction (SSE2): psubd / _mm_sub_epi32
264 // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
265 4u;
266#elif defined(__ARM_NEON__)
267 4u;
268#elif defined(__ALTIVEC__)
269 4u;
270#else
271 1u;
272#endif
273 };
274
275 // Number of elements of the given type that can be processed in parallel in a vector register.
276 template<>
278 {
279 static constexpr std::size_t value =
280#if defined(__AVX512F__) || defined(__MIC__)
281 // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
282 // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
283 // multiplication: ?
284 16u;
285#elif defined(__AVX2__)
286 // addition (AVX2): vpaddd / _mm256_add_epi32
287 // subtraction (AVX2): vpsubd / _mm256_sub_epi32
288 // multiplication: ?
289 8u;
290#elif defined(__SSE2__)
291 // addition (SSE2): paddd / _mm_add_epi32
292 // subtraction (SSE2): psubd / _mm_sub_epi32
293 // multiplication: ?
294 4u;
295#elif defined(__ARM_NEON__)
296 4u;
297#elif defined(__ALTIVEC__)
298 4u;
299#else
300 1u;
301#endif
302 };
303
304 // Number of elements of the given type that can be processed in parallel in a vector register.
305 template<>
307 {
308 static constexpr std::size_t value =
309#if defined(__AVX512F__)
310 // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
311 // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
312 // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
313 8u;
314#elif defined(__AVX2__)
315 // addition (AVX2): vpaddq / _mm256_add_epi64
316 // subtraction (AVX2): vpsubq / _mm256_sub_epi64
317 // multiplication: -
318 4u;
319#elif defined(__SSE2__)
320 // addition (SSE2): paddq / _mm_add_epi64
321 // subtraction (SSE2): psubq / _mm_sub_epi64
322 // multiplication: -
323 2u;
324#elif defined(__ARM_NEON__)
325 2u;
326#else
327 1u;
328#endif
329 };
330
331 // Number of elements of the given type that can be processed in parallel in a vector register.
332 template<>
334 {
335 static constexpr std::size_t value =
336#if defined(__AVX512F__)
337 // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
338 // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
339 // multiplication: ?
340 8u;
341#elif defined(__AVX2__)
342 // addition (AVX2): vpaddq / _mm256_add_epi64
343 // subtraction (AVX2): vpsubq / _mm256_sub_epi64
344 // multiplication: ?
345 4u;
346#elif defined(__SSE2__)
347 // addition (SSE2): paddq / _mm_add_epi64
348 // subtraction (SSE2): psubq / _mm_sub_epi64
349 // multiplication: ?
350 2u;
351#elif defined(__ARM_NEON__)
352 2u;
353#else
354 1u;
355#endif
356 };
357} // namespace alpaka::core::vectorization
Suggests vectorization of the directly following loop to the compiler.
Definition Vectorize.hpp:31
constexpr std::size_t defaultAlignment
Definition Vectorize.hpp:33
auto clipCast(V const &val) -> T
Definition ClipCast.hpp:16
STL namespace.