LLAMA — Low-Level Abstraction of Memory Access
Copy.hpp — view-to-view copy algorithms (field-wise copy, per-blob memcpy, AoSoA common-block copy).
1 // Copyright 2021 Bernhard Manfred Gruber
2 // SPDX-License-Identifier: MPL-2.0
3 
4 #pragma once
5 
6 #include "View.hpp"
7 #include "mapping/AoSoA.hpp"
8 #include "mapping/SoA.hpp"
9 
10 #include <cstring>
11 #include <numeric>
12 
13 namespace llama
14 {
15  namespace internal
16  {
17  template<typename RecordDim>
19  {
20  forEachLeafCoord<RecordDim>(
21  [](auto rc)
22  {
23  static_assert(
24  std::is_trivially_copyable_v<GetType<RecordDim, decltype(rc)>>,
25  "All types in the record dimension must be trivially copyable");
26  });
27  }
28 
29  // need a custom memcpy symbol in LLAMA, because with clang+CUDA, there are multiple std::memcpy symbols, so
30  // the address is ambiguous.
31  inline constexpr auto memcpy
32  = [](void* dst, const void* src, std::size_t size) { std::memcpy(dst, src, size); };
33 
34  template<typename MemcpyFunc = decltype(memcpy)>
36  std::byte* dst,
37  const std::byte* src,
38  std::size_t size,
39  std::size_t threadId = 0,
40  std::size_t threadCount = 1,
41  MemcpyFunc singleThreadMemcpy = memcpy)
42  {
43  const auto sizePerThread = size / threadCount;
44  const auto sizeLastThread = sizePerThread + size % threadCount;
45  const auto sizeThisThread = threadId == threadCount - 1 ? sizeLastThread : sizePerThread;
46  singleThreadMemcpy(dst + threadId * sizePerThread, src + threadId * sizePerThread, sizeThisThread);
47  }
48  } // namespace internal
49 
56  template<typename Mapping, typename SrcBlob, typename DstBlob, typename MemcpyFunc = decltype(internal::memcpy)>
58  const View<Mapping, SrcBlob>& srcView,
59  View<Mapping, DstBlob>& dstView,
60  std::size_t threadId = 0,
61  std::size_t threadCount = 1,
62  MemcpyFunc singleThreadMemcpy = internal::memcpy)
63  {
64  internal::assertTrivialCopyable<typename Mapping::RecordDim>();
65 
66  // TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions
67  if(srcView.extents() != dstView.extents())
68  throw std::runtime_error{"Array dimensions sizes are different"};
69 
70  // TODO(bgruber): this is maybe not the best parallel copying strategy
71  for(std::size_t i = 0; i < Mapping::blobCount; i++)
73  &dstView.blobs()[i][0],
74  &srcView.blobs()[i][0],
75  dstView.mapping().blobSize(i),
76  threadId,
77  threadCount,
78  singleThreadMemcpy);
79  }
80 
81  namespace internal
82  {
83  inline constexpr auto copyBlobWithMemcpy = [](const auto& src, auto& dst, std::size_t size)
84  {
85  static_assert(std::is_trivially_copyable_v<std::remove_reference_t<decltype(*&src[0])>>);
86  static_assert(std::is_trivially_copyable_v<std::remove_reference_t<decltype(*&dst[0])>>);
87  std::memcpy(&dst[0], &src[0], size);
88  };
89  } // namespace internal
90 
96  template<
97  typename Mapping,
98  typename SrcBlob,
99  typename DstBlob,
100  typename BlobCopyFunc = decltype(internal::copyBlobWithMemcpy)>
101  void copyBlobs(
102  const View<Mapping, SrcBlob>& srcView,
103  View<Mapping, DstBlob>& dstView,
104  BlobCopyFunc copyBlob = internal::copyBlobWithMemcpy)
105  {
106  // TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions
107  if(srcView.extents() != dstView.extents())
108  throw std::runtime_error{"Array dimensions sizes are different"};
109  for(std::size_t i = 0; i < Mapping::blobCount; i++)
110  copyBlob(srcView.blobs()[i], dstView.blobs()[i], dstView.mapping().blobSize(i));
111  }
112 
117  template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob>
119  const View<SrcMapping, SrcBlob>& srcView,
120  View<DstMapping, DstBlob>& dstView,
121  std::size_t threadId = 0,
122  std::size_t threadCount = 1)
123  {
124  // TODO(bgruber): think if we can remove this restriction
125  static_assert(
126  std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>,
127  "The source and destination record dimensions must be the same");
128 
129  if(srcView.extents() != dstView.extents())
130  throw std::runtime_error{"Array dimensions sizes are different"};
131 
132  auto copyOne = [&](auto ai) LLAMA_LAMBDA_INLINE
133  {
134  forEachLeafCoord<typename DstMapping::RecordDim>([&](auto rc) LLAMA_LAMBDA_INLINE
135  { dstView(ai)(rc) = srcView(ai)(rc); });
136  };
137 
138  constexpr auto dims = SrcMapping::ArrayExtents::rank;
139  const auto extents = srcView.extents().toArray();
140  const auto workPerThread = (extents[0] + threadCount - 1) / threadCount;
141  const auto start = threadId * workPerThread;
142  const auto end = std::min((threadId + 1) * workPerThread, static_cast<std::size_t>(extents[0]));
143  for(auto i = start; i < end; i++)
144  {
145  using SrcSizeType = typename SrcMapping::ArrayExtents::value_type;
146  if constexpr(dims > 1)
147  forEachArrayIndex(extents, copyOne, static_cast<SrcSizeType>(i));
148  else
149  copyOne(ArrayIndex<SrcSizeType, dims>{static_cast<std::size_t>(i)});
150  }
151  }
152 
153  namespace internal
154  {
155  template<typename Mapping>
156  inline constexpr std::size_t aosoaLanes = 1;
157 
158  template<
159  typename ArrayExtents,
160  typename RecordDim,
163  typename LinearizeArrayIndexFunctor,
164  template<typename>
165  typename PermuteSBFields>
166  inline constexpr std::size_t aosoaLanes<
167  mapping::
168  SoA<ArrayExtents, RecordDim, Blobs, SubArrayAlignment, LinearizeArrayIndexFunctor, PermuteSBFields>>
169  = std::numeric_limits<std::size_t>::max();
170 
171  template<
172  typename ArrayExtents,
173  typename RecordDim,
174  typename ArrayExtents::value_type Lanes,
176  typename LinearizeArrayIndexFunctor,
177  template<typename>
178  typename PermuteFields>
179  inline constexpr std::size_t
180  aosoaLanes<mapping::AoSoA<ArrayExtents, RecordDim, Lanes, FA, LinearizeArrayIndexFunctor, PermuteFields>>
181  = Lanes;
182  } // namespace internal
183 
189  template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob>
191  const View<SrcMapping, SrcBlob>& srcView,
192  View<DstMapping, DstBlob>& dstView,
193  std::size_t threadId = 0,
194  std::size_t threadCount = 1)
195  {
196  static_assert(
197  mapping::isAoSoA<SrcMapping> || mapping::isSoA<SrcMapping>,
198  "Only AoSoA and SoA mappings allowed as source");
199  static_assert(
200  mapping::isAoSoA<DstMapping> || mapping::isSoA<DstMapping>,
201  "Only AoSoA and SoA mappings allowed as destination");
202 
203  // TODO(bgruber): think if we can remove this restriction
204  static_assert(
205  std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>,
206  "The source and destination record dimensions must be the same");
207  static_assert(
208  std::is_same_v<
209  typename SrcMapping::LinearizeArrayIndexFunctor,
210  typename DstMapping::LinearizeArrayIndexFunctor>,
211  "Source and destination mapping need to use the same array dimensions linearizer");
212  using RecordDim = typename SrcMapping::RecordDim;
213  internal::assertTrivialCopyable<RecordDim>();
214 
215  static constexpr auto lanesSrc = internal::aosoaLanes<SrcMapping>;
216  static constexpr auto lanesDst = internal::aosoaLanes<DstMapping>;
217 
218  if(srcView.extents() != dstView.extents())
219  throw std::runtime_error{"Array dimensions sizes are different"};
220 
221  static constexpr auto srcIsAoSoA = lanesSrc != std::numeric_limits<std::size_t>::max();
222  static constexpr auto dstIsAoSoA = lanesDst != std::numeric_limits<std::size_t>::max();
223 
224  static_assert(srcIsAoSoA || dstIsAoSoA, "At least one of the mappings must be an AoSoA mapping");
225  static_assert(!srcIsAoSoA || SrcMapping::blobCount == 1, "Implementation assumes AoSoA with single blob");
226  static_assert(!dstIsAoSoA || DstMapping::blobCount == 1, "Implementation assumes AoSoA with single blob");
227 
228  const auto flatSize = product(dstView.extents());
229 
230  // TODO(bgruber): implement the following by adding additional copy loops for the remaining elements
231  if(!srcIsAoSoA && flatSize % lanesDst != 0)
232  throw std::runtime_error{"Source SoA mapping's total array elements must be evenly divisible by the "
233  "destination AoSoA Lane count."};
234  if(!dstIsAoSoA && flatSize % lanesSrc != 0)
235  throw std::runtime_error{"Destination SoA mapping's total array elements must be evenly divisible by the "
236  "source AoSoA Lane count."};
237 
238  auto mapSrc = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE
239  {
240  const auto [blob, off] = srcView.mapping().blobNrAndOffset(flatArrayIndex, rc);
241  return &srcView.blobs()[blob][off];
242  };
243  auto mapDst = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE
244  {
245  const auto [blob, off] = dstView.mapping().blobNrAndOffset(flatArrayIndex, rc);
246  return &dstView.blobs()[blob][off];
247  };
248 
249  static constexpr auto l = []
250  {
251  if constexpr(srcIsAoSoA && dstIsAoSoA)
252  return std::gcd(lanesSrc, lanesDst);
253  return std::min(lanesSrc, lanesDst);
254  }();
255  if constexpr(lanesSrc < lanesDst)
256  {
257  static_assert(srcIsAoSoA);
258 
259  // optimized for linear reading
260  const auto elementsPerThread = flatSize / lanesSrc / threadCount * lanesSrc;
261  const auto start = threadId * elementsPerThread;
262  const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
263 
264  static constexpr auto packed = SrcMapping::fieldAlignment == mapping::FieldAlignment::Pack;
265  decltype(mapSrc(start, RecordCoord<>{})) src;
266  if constexpr(packed)
267  src = mapSrc(start, RecordCoord<>{});
268  for(std::size_t i = start; i < stop; i += lanesSrc)
269  forEachLeafCoord<RecordDim>(
270  [&](auto rc) LLAMA_LAMBDA_INLINE
271  {
272  if constexpr(!packed)
273  src = mapSrc(i, rc);
274  for(std::size_t j = 0; j < lanesSrc; j += l)
275  {
276  assert(src == mapSrc(i + j, rc));
277  static constexpr auto bytes = l * sizeof(GetType<RecordDim, decltype(rc)>);
278  std::memcpy(mapDst(i + j, rc), src, bytes);
279  src += bytes;
280  }
281  });
282  }
283  else
284  {
285  static_assert(dstIsAoSoA);
286 
287  // optimized for linear writing
288  const auto elementsPerThread = flatSize / lanesDst / threadCount * lanesDst;
289  const auto start = threadId * elementsPerThread;
290  const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
291 
292  static constexpr auto packed = DstMapping::fieldAlignment == mapping::FieldAlignment::Pack;
293  decltype(mapDst(start, RecordCoord<>{})) dst;
294  if constexpr(packed)
295  dst = mapDst(start, RecordCoord<>{});
296  for(std::size_t i = start; i < stop; i += lanesDst)
297  forEachLeafCoord<RecordDim>(
298  [&](auto rc) LLAMA_LAMBDA_INLINE
299  {
300  if constexpr(!packed)
301  dst = mapDst(i, rc);
302  for(std::size_t j = 0; j < lanesDst; j += l)
303  {
304  assert(dst == mapDst(i + j, rc));
305  constexpr auto bytes = l * sizeof(GetType<RecordDim, decltype(rc)>);
306  std::memcpy(dst, mapSrc(i + j, rc), bytes);
307  dst += bytes;
308  }
309  });
310  }
311  }
312 
318  template<typename SrcMapping, typename DstMapping, typename SFINAE = void>
319  struct Copy
320  {
321  template<typename SrcView, typename DstView>
322  void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
323  {
324  fieldWiseCopy(srcView, dstView, threadId, threadCount);
325  }
326  };
327 
329  template<typename Mapping>
330  struct Copy<Mapping, Mapping>
331  {
332  template<typename SrcView, typename DstView>
333  void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const
334  {
335  // FIXME(bgruber): need to fallback to fieldWiseCopy when elements are not trivially copyable
336  memcpyBlobs(srcView, dstView, threadId, threadCount);
337  }
338  };
339 
341  template<
342  typename ArrayExtents,
343  typename RecordDim,
344  typename LinearizeArrayIndex,
345  typename ArrayExtents::value_type LanesSrc,
346  typename ArrayExtents::value_type LanesDst,
347  mapping::FieldAlignment AlignSrc,
348  mapping::FieldAlignment AlignDst,
349  template<typename>
350  typename PermuteFields>
351  struct Copy<
352  mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields>,
353  mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields>,
354  std::enable_if_t<LanesSrc != LanesDst>>
355  {
356  template<typename SrcBlob, typename DstBlob>
358  const View<
360  SrcBlob>& srcView,
361  View<
363  DstBlob>& dstView,
364  std::size_t threadId,
365  std::size_t threadCount)
366  {
367  aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount);
368  }
369  };
370 
372  template<
373  typename ArrayExtents,
374  typename RecordDim,
375  typename LinearizeArrayIndex,
376  template<typename>
377  typename PermuteFields,
378  typename ArrayExtents::value_type LanesSrc,
379  mapping::FieldAlignment AlignSrc,
380  mapping::Blobs DstBlobs,
381  mapping::SubArrayAlignment DstSubArrayAlignment>
382  struct Copy<
383  mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields>,
384  mapping::SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>>
385  {
386  template<typename SrcBlob, typename DstBlob>
388  const View<
390  SrcBlob>& srcView,
391  View<
392  mapping::
393  SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
394  DstBlob>& dstView,
395  std::size_t threadId,
396  std::size_t threadCount)
397  {
398  aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount);
399  }
400  };
401 
403  template<
404  typename ArrayExtents,
405  typename RecordDim,
406  typename LinearizeArrayIndex,
407  template<typename>
408  typename PermuteFields,
409  typename ArrayExtents::value_type LanesDst,
410  mapping::FieldAlignment AlignDst,
411  mapping::Blobs SrcBlobs,
412  mapping::SubArrayAlignment SrcSubArrayAlignment>
413  struct Copy<
414  mapping::SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
415  mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields>>
416  {
417  template<typename SrcBlob, typename DstBlob>
419  const View<
420  mapping::
421  SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
422  SrcBlob>& srcView,
423  View<
425  DstBlob>& dstView,
426  std::size_t threadId,
427  std::size_t threadCount)
428  {
429  aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount);
430  }
431  };
432 
434  template<
435  typename ArrayExtents,
436  typename RecordDim,
437  mapping::Blobs SrcBlobs,
438  mapping::Blobs DstBlobs,
439  mapping::SubArrayAlignment SrcSubArrayAlignment,
440  mapping::SubArrayAlignment DstSubArrayAlignment,
441  typename LinearizeArrayIndex,
442  template<typename>
443  typename PermuteFields>
444  struct Copy<
445  mapping::SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
446  mapping::SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
447  std::enable_if_t<SrcBlobs != DstBlobs || SrcSubArrayAlignment != DstSubArrayAlignment>>
448  {
449  template<typename SrcBlob, typename DstBlob>
451  const View<
452  mapping::
453  SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
454  SrcBlob>& srcView,
455  View<
456  mapping::
457  SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
458  DstBlob>& dstView,
459  std::size_t threadId,
460  std::size_t threadCount)
461  {
462  if(srcView.extents() != dstView.extents())
463  throw std::runtime_error{"Array dimensions sizes are different"};
464 
465  const auto subArrayLength = product(srcView.extents());
466  forEachLeafCoord<RecordDim>(
467  [&](auto rc) LLAMA_LAMBDA_INLINE
468  {
469  auto subArrayStart = [&](auto& view, auto rc) LLAMA_LAMBDA_INLINE
470  {
471  const auto [blob, off] = view.mapping().blobNrAndOffset(0, rc);
472  return &view.blobs()[blob][off];
473  };
475  subArrayStart(dstView, rc),
476  subArrayStart(srcView, rc),
477  subArrayLength * sizeof(GetType<RecordDim, decltype(rc)>),
478  threadId,
479  threadCount);
480  });
481  }
482  };
483 
490  template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob>
491  void copy(
492  const View<SrcMapping, SrcBlob>& srcView,
493  View<DstMapping, DstBlob>& dstView,
494  std::size_t threadId = 0,
495  std::size_t threadCount = 1)
496  {
497  Copy<SrcMapping, DstMapping>{}(srcView, dstView, threadId, threadCount);
498  }
499 } // namespace llama
#define LLAMA_EXPORT
Definition: macros.hpp:192
#define LLAMA_LAMBDA_INLINE
Gives strong indication to the compiler to inline the attributed lambda.
Definition: macros.hpp:113
constexpr auto copyBlobWithMemcpy
Definition: Copy.hpp:83
void parallelMemcpy(std::byte *dst, const std::byte *src, std::size_t size, std::size_t threadId=0, std::size_t threadCount=1, MemcpyFunc singleThreadMemcpy=memcpy)
Definition: Copy.hpp:35
constexpr std::size_t aosoaLanes
Definition: Copy.hpp:156
void assertTrivialCopyable()
Definition: Copy.hpp:18
constexpr auto memcpy
Definition: Copy.hpp:32
SoA(TArrayExtents, TRecordDim) -> SoA< TArrayExtents, TRecordDim >
SubArrayAlignment
Definition: SoA.hpp:21
void forEachArrayIndex([[maybe_unused]] const ArrayIndex< SizeType, Dim > &extents, Func &&func, OuterIndices... outerIndices)
void copyBlobs(const View< Mapping, SrcBlob > &srcView, View< Mapping, DstBlob > &dstView, BlobCopyFunc copyBlob=internal::copyBlobWithMemcpy)
Definition: Copy.hpp:101
ArrayExtents(Args...) -> ArrayExtents< typename internal::IndexTypeFromArgs< std::size_t, Args... >::type,(Args{}, dyn)... >
void memcpyBlobs(const View< Mapping, SrcBlob > &srcView, View< Mapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1, MemcpyFunc singleThreadMemcpy=internal::memcpy)
Definition: Copy.hpp:57
void aosoaCommonBlockCopy(const View< SrcMapping, SrcBlob > &srcView, View< DstMapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1)
Definition: Copy.hpp:190
void copy(const View< SrcMapping, SrcBlob > &srcView, View< DstMapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1)
Definition: Copy.hpp:491
constexpr auto product(Array< T, N > a) -> T
Definition: Array.hpp:315
typename internal::GetTypeImpl< RecordDim, RecordCoordOrTags... >::type GetType
Definition: Core.hpp:388
void fieldWiseCopy(const View< SrcMapping, SrcBlob > &srcView, View< DstMapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1)
Definition: Copy.hpp:118
void operator()(const SrcView &srcView, DstView &dstView, std::size_t threadId, std::size_t threadCount) const
Definition: Copy.hpp:333
void operator()(const View< mapping::AoSoA< ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::AoSoA< ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
Definition: Copy.hpp:357
void operator()(const View< mapping::AoSoA< ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::SoA< ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
Definition: Copy.hpp:387
void operator()(const View< mapping::SoA< ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::AoSoA< ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
Definition: Copy.hpp:418
void operator()(const View< mapping::SoA< ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::SoA< ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
Definition: Copy.hpp:450
Generic implementation of copy defaulting to fieldWiseCopy. LLAMA provides several specializations of...
Definition: Copy.hpp:320
void operator()(const SrcView &srcView, DstView &dstView, std::size_t threadId, std::size_t threadCount) const
Definition: Copy.hpp:322
auto mapping() -> Mapping &
Definition: View.hpp:436
auto blobs() -> Array< BlobType, Mapping::blobCount > &
Definition: View.hpp:565
auto extents() const -> ArrayExtents
Definition: View.hpp:456