Low-Level Abstraction of Memory Access
Simd.hpp
Go to the documentation of this file.
1 // Copyright 2022 Bernhard Manfred Gruber
2 // SPDX-License-Identifier: MPL-2.0
3 
4 #pragma once
5 
6 #include "Core.hpp"
7 #include "RecordRef.hpp"
8 #include "macros.hpp"
9 #include "mapping/AoS.hpp"
10 #include "mapping/AoSoA.hpp"
11 #include "mapping/SoA.hpp"
12 
13 #include <type_traits>
14 
15 namespace llama
16 {
31  template<typename Simd, typename SFINAE = void>
32  struct SimdTraits
33  {
34  static_assert(sizeof(Simd) == 0, "Please specialize SimdTraits for the type Simd");
35  };
36 
38  template<typename T>
39  struct SimdTraits<T, std::enable_if_t<std::is_arithmetic_v<T>>>
40  {
41  using value_type = T;
42 
43  inline static constexpr std::size_t lanes = 1;
44 
45  static LLAMA_FN_HOST_ACC_INLINE auto loadUnaligned(const T* mem) -> T
46  {
47  return *mem;
48  }
49 
50  static LLAMA_FN_HOST_ACC_INLINE void storeUnaligned(T t, T* mem)
51  {
52  *mem = t;
53  }
54 
55  static LLAMA_FN_HOST_ACC_INLINE auto gather(const value_type* mem, std::array<int, lanes> indices) -> T
56  {
57  return mem[indices[0]];
58  }
59 
60  static LLAMA_FN_HOST_ACC_INLINE void scatter(T t, value_type* mem, std::array<int, lanes> indices)
61  {
62  mem[indices[0]] = t;
63  }
64  };
65 
69  template<typename Simd, typename SFINAE = void>
70  inline constexpr auto simdLanes = SimdTraits<Simd>::lanes;
71 
77  template<typename RecordDim, template<typename> typename MakeSimd, typename BinaryReductionFunction>
78  LLAMA_CONSTEVAL auto chooseSimdLanes(BinaryReductionFunction reduce) -> std::size_t
79  {
80  using FRD = FlatRecordDim<RecordDim>;
81  std::size_t lanes = simdLanes<MakeSimd<mp_first<FRD>>>;
82  mp_for_each_inline<mp_transform<std::add_pointer_t, mp_drop_c<FRD, 1>>>(
83  [&](auto* t)
84  {
85  using T = std::remove_reference_t<decltype(*t)>;
86  lanes = reduce(lanes, simdLanes<MakeSimd<T>>);
87  });
88  assert(lanes > 0);
89  return lanes;
90  }
91 
99  template<typename RecordDim, template<typename> typename MakeSimd>
100  inline constexpr std::size_t simdLanesWithFullVectorsFor
101  = chooseSimdLanes<RecordDim, MakeSimd>([](auto a, auto b) { return std::max(a, b); });
102 
110  template<typename RecordDim, template<typename> typename MakeSimd>
111  inline constexpr std::size_t simdLanesWithLeastRegistersFor
112  = chooseSimdLanes<RecordDim, MakeSimd>([](auto a, auto b) { return std::min(a, b); });
113 
114  namespace internal
115  {
116  template<std::size_t N, template<typename, /* std::integral */ auto> typename MakeSizedSimd>
118  {
119  template<typename U>
120  using fn = MakeSizedSimd<U, N>;
121  };
122 
123  template<
124  typename RecordDim,
125  std::size_t N,
126  template<typename, /* std::integral */ auto>
127  typename MakeSizedSimd>
129  {
131  };
132 
133  template<typename RecordDim, template<typename, /* std::integral */ auto> typename MakeSizedSimd>
134  struct SimdizeNImpl<RecordDim, 1, MakeSizedSimd>
135  {
136  using type = RecordDim;
137  };
138  } // namespace internal
139 
144  template<typename RecordDim, std::size_t N, template<typename, /* std::integral */ auto> typename MakeSizedSimd>
146 
150  template<typename RecordDim, template<typename> typename MakeSimd>
152 
158  template<typename T, std::size_t N, template<typename, /* std::integral */ auto> typename MakeSizedSimd>
159  using SimdN = typename std::conditional_t<
160  isRecordDim<T>,
161  std::conditional_t<N == 1, mp_identity<One<T>>, mp_identity<One<SimdizeN<T, N, MakeSizedSimd>>>>,
162  std::conditional_t<N == 1, mp_identity<T>, mp_identity<SimdizeN<T, N, MakeSizedSimd>>>>::type;
163 
167  template<typename T, template<typename> typename MakeSimd>
168  using Simd = typename std::
169  conditional_t<isRecordDim<T>, mp_identity<One<Simdize<T, MakeSimd>>>, mp_identity<Simdize<T, MakeSimd>>>::type;
170 
171  namespace internal
172  {
173  template<std::size_t S>
174  struct SizeEqualTo
175  {
176  template<typename Simd>
177  using fn = std::bool_constant<simdLanes<Simd> == S>;
178  };
179  } // namespace internal
180 
184  template<typename Simd>
185  inline constexpr std::size_t simdLanes<Simd, std::enable_if_t<isRecordRef<Simd>>> = []
186  {
188  using FirstFieldType = mp_first<FRD>;
189  static_assert(mp_all_of_q<FRD, internal::SizeEqualTo<simdLanes<FirstFieldType>>>::value);
190  return simdLanes<FirstFieldType>;
191  }();
192 
193  namespace internal
194  {
195  template<typename AoSMapping, typename ElementType, std::size_t Lanes>
196  inline constexpr auto aosStridedIndices = []()
197  {
198  auto stride = flatSizeOf<
200  AoSMapping::fieldAlignment == llama::mapping::FieldAlignment::Align>
201  / sizeof(ElementType);
202  std::array<int, Lanes> indices{};
203  for(int i = 0; i < static_cast<int>(Lanes); i++)
204  indices[i] = i * stride;
205  return indices;
206  }();
207 
208  template<typename T, typename Simd, typename SrcRC, typename DstRC>
209  LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC)
210  {
212  using ElementSimd = std::decay_t<decltype(dstSimd(dstRC))>;
213  using Traits = SimdTraits<ElementSimd>;
214 
215  auto loadElementWise = [&]
216  {
217  auto b = ArrayIndexIterator{srcRef.view.extents(), srcRef.arrayIndex()};
218  for(std::size_t i = 0; i < Traits::lanes; i++)
219  reinterpret_cast<FieldType*>(&dstSimd(dstRC))[i]
220  = srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, srcRC));
221  };
222 
223  // TODO(bgruber): can we generalize the logic whether we can load a dstSimd from that mapping?
224  using Mapping = typename T::View::Mapping;
225  if constexpr(mapping::isSoA<Mapping>)
226  {
228  dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC));
230  }
231  else if constexpr(mapping::isAoSoA<typename T::View::Mapping>)
232  {
233  // TODO(bgruber): this check is too strict
234  if(T::View::Mapping::ArrayExtents::rank == 1 && srcRef.arrayIndex()[0] % Traits::lanes == 0
235  && T::View::Mapping::lanes >= Traits::lanes)
236  {
238  dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC));
240  }
241  else
242  loadElementWise();
243  }
244  else if constexpr(mapping::isAoS<Mapping>)
245  {
247  dstSimd(dstRC) = Traits::gather(&srcRef(srcRC), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
249  }
250  else
251  loadElementWise();
252  }
253 
254  template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
255  LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC)
256  {
257  using T = std::remove_reference_t<TFwd>;
259  using ElementSimd = std::decay_t<decltype(srcSimd(srcRC))>;
260  using Traits = SimdTraits<ElementSimd>;
261 
262  auto storeElementWise = [&]
263  {
264  // TODO(bgruber): how does this generalize conceptually to 2D and higher dimensions? in which
265  // direction should we collect SIMD values?
266  auto b = ArrayIndexIterator{dstRef.view.extents(), dstRef.arrayIndex()};
267  for(std::size_t i = 0; i < Traits::lanes; i++)
268  dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, dstRC))
269  = reinterpret_cast<const FieldType*>(&srcSimd(srcRC))[i];
270  };
271 
272  // TODO(bgruber): can we generalize the logic whether we can store a srcSimd to that mapping?
273  using Mapping = typename std::remove_reference_t<T>::View::Mapping;
274  if constexpr(mapping::isSoA<Mapping>)
275  {
277  Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC));
279  }
280  else if constexpr(mapping::isAoSoA<typename T::View::Mapping>)
281  {
282  // TODO(bgruber): this check is too strict
283  if(T::View::Mapping::ArrayExtents::rank == 1 && dstRef.arrayIndex()[0] % Traits::lanes == 0
284  && T::View::Mapping::lanes >= Traits::lanes)
285  {
287  Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC));
289  }
290  else
291  storeElementWise();
292  }
293  else if constexpr(mapping::isAoS<Mapping>)
294  {
296  Traits::scatter(srcSimd(srcRC), &dstRef(dstRC), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
298  }
299  else
300  storeElementWise();
301  }
302  } // namespace internal
303 
309  template<typename T, typename Simd>
310  LLAMA_FN_HOST_ACC_INLINE void loadSimd(const T& srcRef, Simd& dstSimd)
311  {
312  // structured dstSimd type and record reference
313  if constexpr(isRecordRef<Simd> && isRecordRef<T>)
314  {
315  if constexpr(simdLanes<Simd> == simdLanes<T>) // fast path mainly for scalar SimdN<T, 1, ...>
316  dstSimd = srcRef;
317  else
318  {
319  using SrcARD = typename T::AccessibleRecordDim;
320  using DstArd = typename Simd::AccessibleRecordDim;
321  if constexpr(std::is_same_v<SrcARD, DstArd>)
322  {
323  forEachLeafCoord<SrcARD>([&](auto rc) LLAMA_LAMBDA_INLINE
324  { internal::loadSimdFromField(srcRef, dstSimd, rc, rc); });
325  }
326  else
327  {
328  forEachLeafCoord<SrcARD>(
329  [&](auto srcRC) LLAMA_LAMBDA_INLINE
330  {
331  using SrcInnerCoord = decltype(srcRC);
332  forEachLeafCoord<DstArd>(
333  [&](auto dstRC) LLAMA_LAMBDA_INLINE
334  {
335  using DstInnerCoord = decltype(dstRC);
336  if constexpr(hasSameTags<SrcARD, SrcInnerCoord, DstArd, DstInnerCoord>)
337  {
338  internal::loadSimdFromField(srcRef, dstSimd, srcRC, dstRC);
339  }
340  });
341  });
342  }
343  }
344  }
345  // unstructured dstSimd and reference type
346  else if constexpr(!isRecordRef<Simd> && !isRecordRef<T>)
347  {
349  dstSimd = SimdTraits<Simd>::loadUnaligned(&srcRef);
351  }
352  else
353  {
354  // TODO(bgruber): when could we get here? Is this always an error?
355  static_assert(sizeof(Simd) == 0, "Invalid combination of Simd type and reference type");
356  }
357  }
358 
364  template<typename Simd, typename TFwd>
365  LLAMA_FN_HOST_ACC_INLINE void storeSimd(const Simd& srcSimd, TFwd&& dstRef)
366  {
367  using T = std::decay_t<TFwd>;
368  // structured Simd type and record reference
369  if constexpr(isRecordRef<Simd> && isRecordRef<T>)
370  {
371  if constexpr(simdLanes<Simd> == simdLanes<T>) // fast path mainly for scalar SimdN<T, 1, ...>
372  dstRef = srcSimd;
373  else
374  {
375  using SrcARD = typename Simd::AccessibleRecordDim;
376  using DstArd = typename T::AccessibleRecordDim;
377  if constexpr(std::is_same_v<SrcARD, DstArd>)
378  {
379  forEachLeafCoord<SrcARD>([&](auto rc) LLAMA_LAMBDA_INLINE
380  { internal::storeSimdToField(srcSimd, dstRef, rc, rc); });
381  }
382  else
383  {
384  forEachLeafCoord<SrcARD>(
385  [&](auto srcRC) LLAMA_LAMBDA_INLINE
386  {
387  using SrcInnerCoord = decltype(srcRC);
388  forEachLeafCoord<DstArd>(
389  [&](auto dstRC) LLAMA_LAMBDA_INLINE
390  {
391  using DstInnerCoord = decltype(dstRC);
392  if constexpr(hasSameTags<SrcARD, SrcInnerCoord, DstArd, DstInnerCoord>)
393  {
394  internal::storeSimdToField(srcSimd, dstRef, srcRC, dstRC);
395  }
396  });
397  });
398  }
399  }
400  }
401  // unstructured srcSimd and reference type
402  else if constexpr(!isRecordRef<Simd> && !isRecordRef<T>)
403  {
405  SimdTraits<Simd>::storeUnaligned(srcSimd, &dstRef);
407  }
408  else
409  {
410  // TODO(bgruber): when could we get here? Is this always an error?
411  static_assert(sizeof(Simd) == 0, "Invalid combination of Simd type and reference type");
412  }
413  }
414 
416  template<
417  std::size_t N,
418  template<typename, /* std::integral */ auto>
419  typename MakeSizedSimd,
420  typename View,
421  typename UnarySimdFunction>
422  void simdForEachN(View& view, UnarySimdFunction f)
423  {
424  using IndexType = typename View::Mapping::ArrayExtents::value_type;
425  const auto total = product(view.mapping().extents());
426  auto it = view.begin();
427  IndexType i = 0;
428  // simd loop
429  while(i + IndexType{N} <= total)
430  {
432  loadSimd(*it, simd);
433  if constexpr(std::is_void_v<decltype(f(simd))>)
434  f(simd);
435  else
436  storeSimd(f(simd), *it);
437  i += IndexType{N};
438  it += IndexType{N};
439  }
440  // tail
441  while(i < total)
442  {
443  auto scalar = One<typename View::RecordDim>{*it};
444  if constexpr(std::is_void_v<decltype(f(scalar))>)
445  f(scalar);
446  else
447  *it = f(scalar);
448  ++i;
449  ++it;
450  }
451  }
452 
454  template<
455  template<typename>
456  typename MakeSimd,
457  template<typename, /* std::integral */ auto>
458  typename MakeSizedSimd,
459  typename View,
460  typename UnarySimdFunction>
461  void simdForEach(View& view, UnarySimdFunction f)
462  {
463  constexpr auto n = llama::simdLanesWithFullVectorsFor<typename View::RecordDim, MakeSimd>;
464  simdForEachN<n, MakeSizedSimd>(view, f);
465  }
466 } // namespace llama
#define LLAMA_EXPORT
Definition: macros.hpp:192
#define LLAMA_LAMBDA_INLINE
Gives strong indication to the compiler to inline the attributed lambda.
Definition: macros.hpp:113
#define LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Definition: macros.hpp:141
#define LLAMA_CONSTEVAL
Expands to consteval if the compilers supports the keyword, otherwise to constexpr.
Definition: macros.hpp:186
#define LLAMA_FN_HOST_ACC_INLINE
Definition: macros.hpp:96
#define LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
Definition: macros.hpp:153
void loadSimdFromField(const T &srcRef, Simd &dstSimd, SrcRC srcRC, DstRC dstRC)
Definition: Simd.hpp:209
void storeSimdToField(const Simd &srcSimd, TFwd &&dstRef, SrcRC srcRC, DstRC dstRC)
Definition: Simd.hpp:255
constexpr auto aosStridedIndices
Definition: Simd.hpp:196
typename internal::FlattenRecordDimImpl< RecordDim >::type FlatRecordDim
Returns a flat type list containing all leaf field types of the given record dimension.
Definition: Core.hpp:481
typename std::conditional_t< isRecordDim< T >, std::conditional_t< N==1, mp_identity< One< T > >, mp_identity< One< SimdizeN< T, N, MakeSizedSimd > >> >, std::conditional_t< N==1, mp_identity< T >, mp_identity< SimdizeN< T, N, MakeSizedSimd > >> >::type SimdN
Definition: Simd.hpp:162
constexpr std::size_t simdLanesWithFullVectorsFor
Definition: Simd.hpp:101
constexpr auto cat(RecordCoords...)
Concatenate a set of RecordCoords instances.
void simdForEachN(View &view, UnarySimdFunction f)
Definition: Simd.hpp:422
void loadSimd(const T &srcRef, Simd &dstSimd)
Definition: Simd.hpp:310
TransformLeavesWithCoord< RecordDim, internal::MakePassSecond< FieldTypeFunctor >::template fn > TransformLeaves
Definition: Core.hpp:758
constexpr auto simdLanes
Definition: Simd.hpp:70
typename internal::SimdizeNImpl< RecordDim, N, MakeSizedSimd >::type SimdizeN
Definition: Simd.hpp:145
constexpr auto product(Array< T, N > a) -> T
Definition: Array.hpp:315
constexpr auto chooseSimdLanes(BinaryReductionFunction reduce) -> std::size_t
Definition: Simd.hpp:78
constexpr std::size_t flatSizeOf
The size of a type list if its elements would be in a normal struct.
Definition: Core.hpp:618
constexpr std::size_t simdLanesWithLeastRegistersFor
Definition: Simd.hpp:112
typename internal::GetTypeImpl< RecordDim, RecordCoordOrTags... >::type GetType
Definition: Core.hpp:388
void simdForEach(View &view, UnarySimdFunction f)
Definition: Simd.hpp:461
void storeSimd(const Simd &srcSimd, TFwd &&dstRef)
Definition: Simd.hpp:365
typename std::conditional_t< isRecordDim< T >, mp_identity< One< Simdize< T, MakeSimd > >>, mp_identity< Simdize< T, MakeSimd > >>::type Simd
Definition: Simd.hpp:169
TransformLeaves< RecordDim, MakeSimd > Simdize
Definition: Simd.hpp:151
Iterator supporting ArrayIndexRange.
static auto gather(const value_type *mem, std::array< int, lanes > indices) -> T
Definition: Simd.hpp:55
static void scatter(T t, value_type *mem, std::array< int, lanes > indices)
Definition: Simd.hpp:60
auto mapping() -> Mapping &
Definition: View.hpp:436
auto begin() -> iterator
Definition: View.hpp:542
MakeSizedSimd< U, N > fn
Definition: Simd.hpp:120
TransformLeaves< RecordDim, internal::BindMakeSizedSimd< N, MakeSizedSimd >::template fn > type
Definition: Simd.hpp:130
std::bool_constant< simdLanes< Simd >==S > fn
Definition: Simd.hpp:177