17 template<
typename RecordDim>
20 forEachLeafCoord<RecordDim>(
24 std::is_trivially_copyable_v<
GetType<RecordDim, decltype(rc)>>,
25 "All types in the record dimension must be trivially copyable");
32 = [](
void* dst,
const void* src, std::size_t size) {
std::memcpy(dst, src, size); };
34 template<
typename MemcpyFunc = decltype(memcpy)>
39 std::size_t threadId = 0,
40 std::size_t threadCount = 1,
41 MemcpyFunc singleThreadMemcpy =
memcpy)
43 const auto sizePerThread = size / threadCount;
44 const auto sizeLastThread = sizePerThread + size % threadCount;
45 const auto sizeThisThread = threadId == threadCount - 1 ? sizeLastThread : sizePerThread;
46 singleThreadMemcpy(dst + threadId * sizePerThread, src + threadId * sizePerThread, sizeThisThread);
56 template<
typename Mapping,
typename SrcBlob,
typename DstBlob,
typename MemcpyFunc = decltype(
internal::memcpy)>
60 std::size_t threadId = 0,
61 std::size_t threadCount = 1,
64 internal::assertTrivialCopyable<typename Mapping::RecordDim>();
68 throw std::runtime_error{
"Array dimensions sizes are different"};
71 for(std::size_t i = 0; i < Mapping::blobCount; i++)
73 &dstView.
blobs()[i][0],
74 &srcView.
blobs()[i][0],
85 static_assert(std::is_trivially_copyable_v<std::remove_reference_t<decltype(*&src[0])>>);
86 static_assert(std::is_trivially_copyable_v<std::remove_reference_t<decltype(*&dst[0])>>);
108 throw std::runtime_error{
"Array dimensions sizes are different"};
109 for(std::size_t i = 0; i < Mapping::blobCount; i++)
117 template<
typename SrcMapping,
typename SrcBlob,
typename DstMapping,
typename DstBlob>
121 std::size_t threadId = 0,
122 std::size_t threadCount = 1)
126 std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>,
127 "The source and destination record dimensions must be the same");
130 throw std::runtime_error{
"Array dimensions sizes are different"};
135 { dstView(ai)(rc) = srcView(ai)(rc); });
138 constexpr
auto dims = SrcMapping::ArrayExtents::rank;
139 const auto extents = srcView.
extents().toArray();
140 const auto workPerThread = (extents[0] + threadCount - 1) / threadCount;
141 const auto start = threadId * workPerThread;
142 const auto end = std::min((threadId + 1) * workPerThread,
static_cast<std::size_t
>(extents[0]));
143 for(
auto i = start; i < end; i++)
145 using SrcSizeType =
typename SrcMapping::ArrayExtents::value_type;
146 if constexpr(dims > 1)
155 template<
typename Mapping>
163 typename LinearizeArrayIndexFunctor,
165 typename PermuteSBFields>
168 SoA<ArrayExtents, RecordDim, Blobs, SubArrayAlignment, LinearizeArrayIndexFunctor, PermuteSBFields>>
169 = std::numeric_limits<std::size_t>::max();
176 typename LinearizeArrayIndexFunctor,
178 typename PermuteFields>
179 inline constexpr std::size_t
180 aosoaLanes<mapping::AoSoA<ArrayExtents, RecordDim, Lanes, FA, LinearizeArrayIndexFunctor, PermuteFields>>
189 template<
typename SrcMapping,
typename SrcBlob,
typename DstMapping,
typename DstBlob>
193 std::size_t threadId = 0,
194 std::size_t threadCount = 1)
197 mapping::isAoSoA<SrcMapping> || mapping::isSoA<SrcMapping>,
198 "Only AoSoA and SoA mappings allowed as source");
200 mapping::isAoSoA<DstMapping> || mapping::isSoA<DstMapping>,
201 "Only AoSoA and SoA mappings allowed as destination");
205 std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>,
206 "The source and destination record dimensions must be the same");
209 typename SrcMapping::LinearizeArrayIndexFunctor,
210 typename DstMapping::LinearizeArrayIndexFunctor>,
211 "Source and destination mapping need to use the same array dimensions linearizer");
212 using RecordDim =
typename SrcMapping::RecordDim;
213 internal::assertTrivialCopyable<RecordDim>();
215 static constexpr
auto lanesSrc = internal::aosoaLanes<SrcMapping>;
216 static constexpr
auto lanesDst = internal::aosoaLanes<DstMapping>;
219 throw std::runtime_error{
"Array dimensions sizes are different"};
221 static constexpr
auto srcIsAoSoA = lanesSrc != std::numeric_limits<std::size_t>::max();
222 static constexpr
auto dstIsAoSoA = lanesDst != std::numeric_limits<std::size_t>::max();
224 static_assert(srcIsAoSoA || dstIsAoSoA,
"At least one of the mappings must be an AoSoA mapping");
225 static_assert(!srcIsAoSoA || SrcMapping::blobCount == 1,
"Implementation assumes AoSoA with single blob");
226 static_assert(!dstIsAoSoA || DstMapping::blobCount == 1,
"Implementation assumes AoSoA with single blob");
231 if(!srcIsAoSoA && flatSize % lanesDst != 0)
232 throw std::runtime_error{
"Source SoA mapping's total array elements must be evenly divisible by the "
233 "destination AoSoA Lane count."};
234 if(!dstIsAoSoA && flatSize % lanesSrc != 0)
235 throw std::runtime_error{
"Destination SoA mapping's total array elements must be evenly divisible by the "
236 "source AoSoA Lane count."};
240 const auto [blob, off] = srcView.
mapping().blobNrAndOffset(flatArrayIndex, rc);
241 return &srcView.
blobs()[blob][off];
245 const auto [blob, off] = dstView.
mapping().blobNrAndOffset(flatArrayIndex, rc);
246 return &dstView.
blobs()[blob][off];
249 static constexpr
auto l = []
251 if constexpr(srcIsAoSoA && dstIsAoSoA)
252 return std::gcd(lanesSrc, lanesDst);
253 return std::min(lanesSrc, lanesDst);
255 if constexpr(lanesSrc < lanesDst)
257 static_assert(srcIsAoSoA);
260 const auto elementsPerThread = flatSize / lanesSrc / threadCount * lanesSrc;
261 const auto start = threadId * elementsPerThread;
262 const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
268 for(std::size_t i = start; i < stop; i += lanesSrc)
269 forEachLeafCoord<RecordDim>(
272 if constexpr(!packed)
274 for(std::size_t j = 0; j < lanesSrc; j += l)
276 assert(src == mapSrc(i + j, rc));
277 static constexpr
auto bytes = l *
sizeof(
GetType<RecordDim, decltype(rc)>);
285 static_assert(dstIsAoSoA);
288 const auto elementsPerThread = flatSize / lanesDst / threadCount * lanesDst;
289 const auto start = threadId * elementsPerThread;
290 const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread;
296 for(std::size_t i = start; i < stop; i += lanesDst)
297 forEachLeafCoord<RecordDim>(
300 if constexpr(!packed)
302 for(std::size_t j = 0; j < lanesDst; j += l)
304 assert(dst == mapDst(i + j, rc));
305 constexpr
auto bytes = l *
sizeof(
GetType<RecordDim, decltype(rc)>);
318 template<
typename SrcMapping,
typename DstMapping,
typename SFINAE =
void>
321 template<
typename SrcView,
typename DstView>
322 void operator()(
const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount)
const
329 template<
typename Mapping>
332 template<
typename SrcView,
typename DstView>
333 void operator()(
const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount)
const
336 memcpyBlobs(srcView, dstView, threadId, threadCount);
344 typename LinearizeArrayIndex,
350 typename PermuteFields>
352 mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields>,
353 mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields>,
354 std::enable_if_t<LanesSrc != LanesDst>>
356 template<
typename SrcBlob,
typename DstBlob>
364 std::size_t threadId,
365 std::size_t threadCount)
375 typename LinearizeArrayIndex,
377 typename PermuteFields,
383 mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields>,
384 mapping::SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>>
386 template<
typename SrcBlob,
typename DstBlob>
393 SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
395 std::size_t threadId,
396 std::size_t threadCount)
406 typename LinearizeArrayIndex,
408 typename PermuteFields,
414 mapping::
SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
415 mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields>>
417 template<
typename SrcBlob,
typename DstBlob>
421 SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
426 std::size_t threadId,
427 std::size_t threadCount)
441 typename LinearizeArrayIndex,
443 typename PermuteFields>
445 mapping::
SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
446 mapping::SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
447 std::enable_if_t<SrcBlobs != DstBlobs || SrcSubArrayAlignment != DstSubArrayAlignment>>
449 template<
typename SrcBlob,
typename DstBlob>
453 SoA<ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
457 SoA<ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields>,
459 std::size_t threadId,
460 std::size_t threadCount)
462 if(srcView.extents() != dstView.extents())
463 throw std::runtime_error{
"Array dimensions sizes are different"};
465 const auto subArrayLength =
product(srcView.extents());
466 forEachLeafCoord<RecordDim>(
471 const auto [blob, off] = view.mapping().blobNrAndOffset(0, rc);
472 return &view.blobs()[blob][off];
475 subArrayStart(dstView, rc),
476 subArrayStart(srcView, rc),
477 subArrayLength *
sizeof(
GetType<RecordDim, decltype(rc)>),
490 template<
typename SrcMapping,
typename SrcBlob,
typename DstMapping,
typename DstBlob>
494 std::size_t threadId = 0,
495 std::size_t threadCount = 1)
#define LLAMA_LAMBDA_INLINE
Gives strong indication to the compiler to inline the attributed lambda.
constexpr auto copyBlobWithMemcpy
void parallelMemcpy(std::byte *dst, const std::byte *src, std::size_t size, std::size_t threadId=0, std::size_t threadCount=1, MemcpyFunc singleThreadMemcpy=memcpy)
constexpr std::size_t aosoaLanes
void assertTrivialCopyable()
SoA(TArrayExtents, TRecordDim) -> SoA< TArrayExtents, TRecordDim >
void forEachArrayIndex([[maybe_unused]] const ArrayIndex< SizeType, Dim > &extents, Func &&func, OuterIndices... outerIndices)
void copyBlobs(const View< Mapping, SrcBlob > &srcView, View< Mapping, DstBlob > &dstView, BlobCopyFunc copyBlob=internal::copyBlobWithMemcpy)
ArrayExtents(Args...) -> ArrayExtents< typename internal::IndexTypeFromArgs< std::size_t, Args... >::type,(Args{}, dyn)... >
void memcpyBlobs(const View< Mapping, SrcBlob > &srcView, View< Mapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1, MemcpyFunc singleThreadMemcpy=internal::memcpy)
void aosoaCommonBlockCopy(const View< SrcMapping, SrcBlob > &srcView, View< DstMapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1)
void copy(const View< SrcMapping, SrcBlob > &srcView, View< DstMapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1)
constexpr auto product(Array< T, N > a) -> T
typename internal::GetTypeImpl< RecordDim, RecordCoordOrTags... >::type GetType
void fieldWiseCopy(const View< SrcMapping, SrcBlob > &srcView, View< DstMapping, DstBlob > &dstView, std::size_t threadId=0, std::size_t threadCount=1)
void operator()(const SrcView &srcView, DstView &dstView, std::size_t threadId, std::size_t threadCount) const
llama::Copy< mapping::AoSoA< ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields >, mapping::AoSoA< ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields >, std::enable_if_t< LanesSrc !=LanesDst > >::operator() void operator()(const View< mapping::AoSoA< ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::AoSoA< ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
llama::Copy< mapping::AoSoA< ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields >, mapping::SoA< ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields > >::operator() void operator()(const View< mapping::AoSoA< ArrayExtents, RecordDim, LanesSrc, AlignSrc, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::SoA< ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
llama::Copy< mapping::SoA< ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, mapping::AoSoA< ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields > >::operator() void operator()(const View< mapping::SoA< ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::AoSoA< ArrayExtents, RecordDim, LanesDst, AlignDst, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
llama::Copy< mapping::SoA< ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, mapping::SoA< ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, std::enable_if_t< SrcBlobs !=DstBlobs||SrcSubArrayAlignment !=DstSubArrayAlignment > >::operator() void operator()(const View< mapping::SoA< ArrayExtents, RecordDim, SrcBlobs, SrcSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, SrcBlob > &srcView, View< mapping::SoA< ArrayExtents, RecordDim, DstBlobs, DstSubArrayAlignment, LinearizeArrayIndex, PermuteFields >, DstBlob > &dstView, std::size_t threadId, std::size_t threadCount)
Generic implementation of copy defaulting to fieldWiseCopy. LLAMA provides several specializations of...
void operator()(const SrcView &srcView, DstView &dstView, std::size_t threadId, std::size_t threadCount) const
auto mapping() -> Mapping &
auto blobs() -> Array< BlobType, Mapping::blobCount > &
auto extents() const -> ArrayExtents