alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Vectorize.hpp
Go to the documentation of this file.
1/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
8
9#include <cstddef>
10#include <cstdint>
11
12//! Suggests vectorization of the directly following loop to the compiler.
13//!
14//! Usage:
15//! `ALPAKA_VECTORIZE_HINT
16//! for(...){...}`
17// \TODO: Implement for other compilers.
18// See: http://stackoverflow.com/questions/2706286/pragmas-swp-ivdep-prefetch-support-in-various-compilers
19/*#if BOOST_COMP_HPACC
20 #define ALPAKA_VECTORIZE_HINT(...) _Pragma("ivdep")
21#elif BOOST_COMP_PGI
22 #define ALPAKA_VECTORIZE_HINT(...) _Pragma("vector")
23#elif BOOST_COMP_MSVC
24 #define ALPAKA_VECTORIZE_HINT(...) __pragma(loop(ivdep))
25#elif BOOST_COMP_GNUC
26 #define ALPAKA_VECTORIZE_HINT(...) _Pragma("GCC ivdep")
27#else
28 #define ALPAKA_VECTORIZE_HINT(...)
29#endif*/
30
32{
33 // The alignment required to enable optimal performance, which depends on the target architecture.
34 constexpr std::size_t defaultAlignment =
35#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__MIC__)
36 64u
37#elif defined(__AVX__) || defined(__AVX2__)
38 32u
39#else
40 16u
41#endif
42 ;
43
44 // Number of elements of the given type that can be processed in parallel in a vector register.
45 // By default there is no vectorization.
46 template<typename TElem>
48 {
49 static constexpr std::size_t value = 1u;
50 };
51
52 // Number of elements of the given type that can be processed in parallel in a vector register.
53 template<>
55 {
56 static constexpr std::size_t value =
57#if defined(__AVX512F__) || defined(__MIC__)
58 // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
59 // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
60 // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
61 8u;
62#elif defined(__AVX__)
63 // addition (AVX): vaddpd / _mm256_add_pd
64 // subtraction (AVX): vsubpd / _mm256_sub_pd
65 // multiplication (AVX): vmulpd / _mm256_mul_pd
66 4u;
67#elif defined(__SSE2__)
68 // addition (SSE2): addpd / _mm_add_pd
69 // subtraction (SSE2): subpd / _mm_sub_pd
70 // multiplication (SSE2): mulpd / _mm_mul_pd
71 2u;
72#elif defined(__ARM_NEON__)
73 // No support for double precision vectorization!
74 1u;
75#elif defined(__ALTIVEC__)
76 2u;
77#else
78 1u;
79#endif
80 };
81
82 // Number of elements of the given type that can be processed in parallel in a vector register.
83 template<>
85 {
86 static constexpr std::size_t value =
87#if defined(__AVX512F__) || defined(__MIC__)
88 // addition (AVX512F,KNC): vaddps / _mm512_add_ps
89 // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
90 // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
91 16u;
92#elif defined(__AVX__)
93 // addition (AVX): vaddps / _mm256_add_ps
94 // subtraction (AVX): vsubps / _mm256_sub_ps
95 // multiplication (AVX): vmulps / _mm256_mul_ps
96 8u;
97#elif defined(__SSE__)
98 // addition (SSE): addps / _mm_add_ps
99 // subtraction (SSE): subps / _mm_sub_ps
100 // multiplication (SSE): mulps / _mm_mul_ps
101 4u;
102#elif defined(__ARM_NEON__)
103 4u;
104#elif defined(__ALTIVEC__)
105 4u;
106#else
107 1u;
108#endif
109 };
110
111 // Number of elements of the given type that can be processed in parallel in a vector register.
112 template<>
114 {
115 static constexpr std::size_t value =
116#if defined(__AVX512BW__)
117 // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
118 // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
119 // multiplication: -
120 64u;
121#elif defined(__AVX2__)
122 // addition (AVX2): vpaddb / _mm256_add_epi8
123 // subtraction (AVX2): vpsubb / _mm256_sub_epi8
124 // multiplication: -
125 32u;
126#elif defined(__SSE2__)
127 // addition (SSE2): paddb / _mm_add_epi8
128 // subtraction (SSE2): psubb / _mm_sub_epi8
129 // multiplication: -
130 16u;
131#elif defined(__ARM_NEON__)
132 16u;
133#elif defined(__ALTIVEC__)
134 16u;
135#elif defined(__CUDA_ARCH__)
136 // addition: __vadd4
137 // subtraction: __vsub4
138 // multiplication: -
139 4u;
140#else
141 1u;
142#endif
143 };
144
145 // Number of elements of the given type that can be processed in parallel in a vector register.
146 template<>
148 {
149 static constexpr std::size_t value =
150#if defined(__AVX512BW__)
151 // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
152 // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
153 // multiplication: -
154 64u;
155#elif defined(__AVX2__)
156 // addition (AVX2): vpaddb / _mm256_add_epi8
157 // subtraction (AVX2): vpsubb / _mm256_sub_epi8
158 // multiplication: -
159 32u;
160#elif defined(__SSE2__)
161 // addition (SSE2): paddb / _mm_add_epi8
162 // subtraction (SSE2): psubb / _mm_sub_epi8
163 // multiplication: -
164 16u;
165#elif defined(__ARM_NEON__)
166 16u;
167#elif defined(__ALTIVEC__)
168 16u;
169#elif defined(__CUDA_ARCH__)
170 // addition: __vadd4
171 // subtraction: __vsub4
172 // multiplication: -
173 4u;
174#else
175 1u;
176#endif
177 };
178
179 // Number of elements of the given type that can be processed in parallel in a vector register.
180 template<>
182 {
183 static constexpr std::size_t value =
184#if defined(__AVX512BW__)
185 // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
186 // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
187 // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
188 32u;
189#elif defined(__AVX2__)
190 // addition (AVX2): vpaddw / _mm256_add_epi16
191 // subtraction (AVX2): vpsubw / _mm256_sub_epi16
192 // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
193 16u;
194#elif defined(__SSE2__)
195 // addition (SSE2): paddw / _mm_add_epi16
196 // subtraction (SSE2): psubw / _mm_sub_epi16
197 // multiplication (SSE2): pmullw / _mm_mullo_epi16
198 8u;
199#elif defined(__ARM_NEON__)
200 8u;
201#elif defined(__ALTIVEC__)
202 8u;
203#elif defined(__CUDA_ARCH__)
204 // addition: __vadd2
205 // subtraction: __vsub2
206 // multiplication: -
207 2u;
208#else
209 1u;
210#endif
211 };
212
213 // Number of elements of the given type that can be processed in parallel in a vector register.
214 template<>
216 {
217 static constexpr std::size_t value =
218#if defined(__AVX512BW__)
219 // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
220 // subtraction (AVX512BW): vpsubusw / _mm512_subs_epu16
221 // multiplication: ?
222 32u;
223#elif defined(__AVX2__)
224 // addition (AVX2): vpaddusw / _mm256_adds_epu16
225 // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
226 // multiplication: ?
227 16u;
228#elif defined(__SSE2__)
229 // addition (SSE2): paddusw / _mm_adds_epu16
230 // subtraction (SSE2): psubusw / _mm_subs_epu16
231 // multiplication: ?
232 8u;
233#elif defined(__ARM_NEON__)
234 8u;
235#elif defined(__ALTIVEC__)
236 8u;
237#elif defined(__CUDA_ARCH__)
238 // addition: __vadd2
239 // subtraction: __vsub2
240 // multiplication: -
241 2u;
242#else
243 1u;
244#endif
245 };
246
247 // Number of elements of the given type that can be processed in parallel in a vector register.
248 template<>
250 {
251 static constexpr std::size_t value =
252#if defined(__AVX512F__) || defined(__MIC__)
253 // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
254 // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
255 // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
256 16u;
257#elif defined(__AVX2__)
258 // addition (AVX2): vpaddd / _mm256_add_epi32
259 // subtraction (AVX2): vpsubd / _mm256_sub_epi32
260 // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
261 8u;
262#elif defined(__SSE2__)
263 // addition (SSE2): paddd / _mm_add_epi32
264 // subtraction (SSE2): psubd / _mm_sub_epi32
265 // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
266 4u;
267#elif defined(__ARM_NEON__)
268 4u;
269#elif defined(__ALTIVEC__)
270 4u;
271#else
272 1u;
273#endif
274 };
275
276 // Number of elements of the given type that can be processed in parallel in a vector register.
277 template<>
279 {
280 static constexpr std::size_t value =
281#if defined(__AVX512F__) || defined(__MIC__)
282 // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
283 // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
284 // multiplication: ?
285 16u;
286#elif defined(__AVX2__)
287 // addition (AVX2): vpaddd / _mm256_add_epi32
288 // subtraction (AVX2): vpsubd / _mm256_sub_epi32
289 // multiplication: ?
290 8u;
291#elif defined(__SSE2__)
292 // addition (SSE2): paddd / _mm_add_epi32
293 // subtraction (SSE2): psubd / _mm_sub_epi32
294 // multiplication: ?
295 4u;
296#elif defined(__ARM_NEON__)
297 4u;
298#elif defined(__ALTIVEC__)
299 4u;
300#else
301 1u;
302#endif
303 };
304
305 // Number of elements of the given type that can be processed in parallel in a vector register.
306 template<>
308 {
309 static constexpr std::size_t value =
310#if defined(__AVX512F__)
311 // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
312 // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
313 // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
314 8u;
315#elif defined(__AVX2__)
316 // addition (AVX2): vpaddq / _mm256_add_epi64
317 // subtraction (AVX2): vpsubq / _mm256_sub_epi64
318 // multiplication: -
319 4u;
320#elif defined(__SSE2__)
321 // addition (SSE2): paddq / _mm_add_epi64
322 // subtraction (SSE2): psubq / _mm_sub_epi64
323 // multiplication: -
324 2u;
325#elif defined(__ARM_NEON__)
326 2u;
327#else
328 1u;
329#endif
330 };
331
332 // Number of elements of the given type that can be processed in parallel in a vector register.
333 template<>
335 {
336 static constexpr std::size_t value =
337#if defined(__AVX512F__)
338 // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
339 // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
340 // multiplication: ?
341 8u;
342#elif defined(__AVX2__)
343 // addition (AVX2): vpaddq / _mm256_add_epi64
344 // subtraction (AVX2): vpsubq / _mm256_sub_epi64
345 // multiplication: ?
346 4u;
347#elif defined(__SSE2__)
348 // addition (SSE2): paddq / _mm_add_epi64
349 // subtraction (SSE2): psubq / _mm_sub_epi64
350 // multiplication: ?
351 2u;
352#elif defined(__ARM_NEON__)
353 2u;
354#else
355 1u;
356#endif
357 };
358} // namespace alpaka::core::vectorization
Suggests vectorization of the directly following loop to the compiler.
Definition Vectorize.hpp:32
constexpr std::size_t defaultAlignment
Definition Vectorize.hpp:34
auto clipCast(V const &val) -> T
Definition ClipCast.hpp:16
STL namespace.