alpaka
Abstraction Library for Parallel Kernel Acceleration
Vectorize.hpp
Go to the documentation of this file.
1 /* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
2  * SPDX-License-Identifier: MPL-2.0
3  */
4 
5 #pragma once
6 
7 #include "alpaka/core/Common.hpp"
8 
9 #include <cstddef>
10 #include <cstdint>
11 
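// Suggests vectorization of the directly following loop to the compiler.
//
// Usage:
// `ALPAKA_VECTORIZE_HINT
// for(...){...}`
//
// NOTE: ALPAKA_VECTORIZE_HINT is currently disabled; all of its definitions below are commented out.
// Plain `//` comments are used here on purpose so that Doxygen does not attach this description to
// the next declaration in the file.
// \TODO: Implement for other compilers.
// See: http://stackoverflow.com/questions/2706286/pragmas-swp-ivdep-prefetch-support-in-various-compilers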
19 /*#if BOOST_COMP_HPACC
20  #define ALPAKA_VECTORIZE_HINT(...) _Pragma("ivdep")
21 #elif BOOST_COMP_PGI
22  #define ALPAKA_VECTORIZE_HINT(...) _Pragma("vector")
23 #elif BOOST_COMP_MSVC
24  #define ALPAKA_VECTORIZE_HINT(...) __pragma(loop(ivdep))
25 #elif BOOST_COMP_GNUC
26  #define ALPAKA_VECTORIZE_HINT(...) _Pragma("GCC ivdep")
27 #else
28  #define ALPAKA_VECTORIZE_HINT(...)
29 #endif*/
30 
namespace alpaka::core::vectorization
{
//! The alignment required to enable optimal performance, dependent on the target architecture.
//!
//! Selected at preprocessing time from the instruction-set macros defined by the compiler:
//! 64 if any of `__AVX512BW__`, `__AVX512F__` or `__MIC__` is defined, otherwise 32 if `__AVX__` or
//! `__AVX2__` is defined, otherwise 16.
//!
//! Declared `inline` so that every translation unit including this header refers to the same
//! object instead of getting its own internal-linkage copy.
#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__MIC__)
inline constexpr std::size_t defaultAlignment = 64u;
#elif defined(__AVX__) || defined(__AVX2__)
inline constexpr std::size_t defaultAlignment = 32u;
#else
inline constexpr std::size_t defaultAlignment = 16u;
#endif
43 
//! Number of elements of the given type that can be processed in parallel in a vector register.
//!
//! This primary template is the fallback for every element type that has no explicit
//! specialization: it reports 1, i.e. no vectorization.
//!
//! \tparam TElem The element type.
template<typename TElem>
struct GetVectorizationSizeElems
{
    static constexpr std::size_t value = 1u;
};
51 
52  // Number of elements of the given type that can be processed in parallel in a vector register.
53  template<>
55  {
56  static constexpr std::size_t value =
57 #if defined(__AVX512F__) || defined(__MIC__)
58  // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
59  // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
60  // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
61  8u;
62 #elif defined(__AVX__)
63  // addition (AVX): vaddpd / _mm256_add_pd
64  // subtraction (AVX): vsubpd / _mm256_sub_pd
65  // multiplication (AVX): vmulpd / _mm256_mul_pd
66  4u;
67 #elif defined(__SSE2__)
68  // addition (SSE2): addpd / _mm_add_pd
69  // subtraction (SSE2): subpd / _mm_sub_pd
70  // multiplication (SSE2): mulpd / _mm_mul_pd
71  2u;
72 #elif defined(__ARM_NEON__)
73  // No support for double precision vectorization!
74  1u;
75 #elif defined(__ALTIVEC__)
76  2u;
77 #else
78  1u;
79 #endif
80  };
81 
82  // Number of elements of the given type that can be processed in parallel in a vector register.
83  template<>
85  {
86  static constexpr std::size_t value =
87 #if defined(__AVX512F__) || defined(__MIC__)
88  // addition (AVX512F,KNC): vaddps / _mm512_add_ps
89  // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
90  // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
91  16u;
92 #elif defined(__AVX__)
93  // addition (AVX): vaddps / _mm256_add_ps
94  // subtraction (AVX): vsubps / _mm256_sub_ps
95  // multiplication (AVX): vmulps / _mm256_mul_ps
96  8u;
97 #elif defined(__SSE__)
98  // addition (SSE): addps / _mm_add_ps
99  // subtraction (SSE): subps / _mm_sub_ps
100  // multiplication (SSE): mulps / _mm_mul_ps
101  4u;
102 #elif defined(__ARM_NEON__)
103  4u;
104 #elif defined(__ALTIVEC__)
105  4u;
106 #else
107  1u;
108 #endif
109  };
110 
111  // Number of elements of the given type that can be processed in parallel in a vector register.
112  template<>
113  struct GetVectorizationSizeElems<std::int8_t>
114  {
115  static constexpr std::size_t value =
116 #if defined(__AVX512BW__)
117  // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
118  // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
119  // multiplication: -
120  64u;
121 #elif defined(__AVX2__)
122  // addition (AVX2): vpaddb / _mm256_add_epi8
123  // subtraction (AVX2): vpsubb / _mm256_sub_epi8
124  // multiplication: -
125  32u;
126 #elif defined(__SSE2__)
127  // addition (SSE2): paddb / _mm_add_epi8
128  // subtraction (SSE2): psubb / _mm_sub_epi8
129  // multiplication: -
130  16u;
131 #elif defined(__ARM_NEON__)
132  16u;
133 #elif defined(__ALTIVEC__)
134  16u;
135 #elif defined(__CUDA_ARCH__)
136  // addition: __vadd4
137  // subtraction: __vsub4
138  // multiplication: -
139  4u;
140 #else
141  1u;
142 #endif
143  };
144 
145  // Number of elements of the given type that can be processed in parallel in a vector register.
146  template<>
147  struct GetVectorizationSizeElems<std::uint8_t>
148  {
149  static constexpr std::size_t value =
150 #if defined(__AVX512BW__)
151  // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
152  // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
153  // multiplication: -
154  64u;
155 #elif defined(__AVX2__)
156  // addition (AVX2): vpaddb / _mm256_add_epi8
157  // subtraction (AVX2): vpsubb / _mm256_sub_epi8
158  // multiplication: -
159  32u;
160 #elif defined(__SSE2__)
161  // addition (SSE2): paddb / _mm_add_epi8
162  // subtraction (SSE2): psubb / _mm_sub_epi8
163  // multiplication: -
164  16u;
165 #elif defined(__ARM_NEON__)
166  16u;
167 #elif defined(__ALTIVEC__)
168  16u;
169 #elif defined(__CUDA_ARCH__)
170  // addition: __vadd4
171  // subtraction: __vsub4
172  // multiplication: -
173  4u;
174 #else
175  1u;
176 #endif
177  };
178 
179  // Number of elements of the given type that can be processed in parallel in a vector register.
180  template<>
181  struct GetVectorizationSizeElems<std::int16_t>
182  {
183  static constexpr std::size_t value =
184 #if defined(__AVX512BW__)
185  // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
186  // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
187  // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
188  32u;
189 #elif defined(__AVX2__)
190  // addition (AVX2): vpaddw / _mm256_add_epi16
191  // subtraction (AVX2): vpsubw / _mm256_sub_epi16
192  // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
193  16u;
194 #elif defined(__SSE2__)
195  // addition (SSE2): paddw / _mm_add_epi16
196  // subtraction (SSE2): psubw / _mm_sub_epi16
197  // multiplication (SSE2): pmullw / _mm_mullo_epi16
198  8u;
199 #elif defined(__ARM_NEON__)
200  8u;
201 #elif defined(__ALTIVEC__)
202  8u;
203 #elif defined(__CUDA_ARCH__)
204  // addition: __vadd2
205  // subtraction: __vsub2
206  // multiplication: -
207  2u;
208 #else
209  1u;
210 #endif
211  };
212 
213  // Number of elements of the given type that can be processed in parallel in a vector register.
214  template<>
215  struct GetVectorizationSizeElems<std::uint16_t>
216  {
217  static constexpr std::size_t value =
218 #if defined(__AVX512BW__)
219  // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
220  // subtraction (AVX512BW): vpsubw / _mm512_subs_epu16
221  // multiplication: ?
222  32u;
223 #elif defined(__AVX2__)
224  // addition (AVX2): vpaddusw / _mm256_adds_epu16
225  // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
226  // multiplication: ?
227  16u;
228 #elif defined(__SSE2__)
229  // addition (SSE2): paddusw / _mm_adds_epu16
230  // subtraction (SSE2): psubusw / _mm_subs_epu16
231  // multiplication: ?
232  8u;
233 #elif defined(__ARM_NEON__)
234  8u;
235 #elif defined(__ALTIVEC__)
236  8u;
237 #elif defined(__CUDA_ARCH__)
238  // addition: __vadd2
239  // subtraction: __vsub2
240  // multiplication: -
241  2u;
242 #else
243  1u;
244 #endif
245  };
246 
247  // Number of elements of the given type that can be processed in parallel in a vector register.
248  template<>
249  struct GetVectorizationSizeElems<std::int32_t>
250  {
251  static constexpr std::size_t value =
252 #if defined(__AVX512F__) || defined(__MIC__)
253  // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
254  // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
255  // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
256  16u;
257 #elif defined(__AVX2__)
258  // addition (AVX2): vpaddd / _mm256_add_epi32
259  // subtraction (AVX2): vpsubd / _mm256_sub_epi32
260  // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
261  8u;
262 #elif defined(__SSE2__)
263  // addition (SSE2): paddd / _mm_add_epi32
264  // subtraction (SSE2): psubd / _mm_sub_epi32
265  // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
266  4u;
267 #elif defined(__ARM_NEON__)
268  4u;
269 #elif defined(__ALTIVEC__)
270  4u;
271 #else
272  1u;
273 #endif
274  };
275 
276  // Number of elements of the given type that can be processed in parallel in a vector register.
277  template<>
278  struct GetVectorizationSizeElems<std::uint32_t>
279  {
280  static constexpr std::size_t value =
281 #if defined(__AVX512F__) || defined(__MIC__)
282  // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
283  // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
284  // multiplication: ?
285  16u;
286 #elif defined(__AVX2__)
287  // addition (AVX2): vpaddd / _mm256_add_epi32
288  // subtraction (AVX2): vpsubd / _mm256_sub_epi32
289  // multiplication: ?
290  8u;
291 #elif defined(__SSE2__)
292  // addition (SSE2): paddd / _mm_add_epi32
293  // subtraction (SSE2): psubd / _mm_sub_epi32
294  // multiplication: ?
295  4u;
296 #elif defined(__ARM_NEON__)
297  4u;
298 #elif defined(__ALTIVEC__)
299  4u;
300 #else
301  1u;
302 #endif
303  };
304 
305  // Number of elements of the given type that can be processed in parallel in a vector register.
306  template<>
307  struct GetVectorizationSizeElems<std::int64_t>
308  {
309  static constexpr std::size_t value =
310 #if defined(__AVX512F__)
311  // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
312  // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
313  // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
314  8u;
315 #elif defined(__AVX2__)
316  // addition (AVX2): vpaddq / _mm256_add_epi64
317  // subtraction (AVX2): vpsubq / _mm256_sub_epi64
318  // multiplication: -
319  4u;
320 #elif defined(__SSE2__)
321  // addition (SSE2): paddq / _mm_add_epi64
322  // subtraction (SSE2): psubq / _mm_sub_epi64
323  // multiplication: -
324  2u;
325 #elif defined(__ARM_NEON__)
326  2u;
327 #else
328  1u;
329 #endif
330  };
331 
332  // Number of elements of the given type that can be processed in parallel in a vector register.
333  template<>
334  struct GetVectorizationSizeElems<std::uint64_t>
335  {
336  static constexpr std::size_t value =
337 #if defined(__AVX512F__)
338  // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
339  // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
340  // multiplication: ?
341  8u;
342 #elif defined(__AVX2__)
343  // addition (AVX2): vpaddq / _mm256_add_epi64
344  // subtraction (AVX2): vpsubq / _mm256_sub_epi64
345  // multiplication: ?
346  4u;
347 #elif defined(__SSE2__)
348  // addition (SSE2): paddq / _mm_add_epi64
349  // subtraction (SSE2): psubq / _mm_sub_epi64
350  // multiplication: ?
351  2u;
352 #elif defined(__ARM_NEON__)
353  2u;
354 #else
355  1u;
356 #endif
357  };
358 } // namespace alpaka::core::vectorization
Suggests vectorization of the directly following loop to the compiler.
Definition: Vectorize.hpp:32
constexpr std::size_t defaultAlignment
Definition: Vectorize.hpp:34