Grok 10.0.5
copy-inl.h
Go to the documentation of this file.
1// Copyright 2022 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target include guard
17#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
21#else
22#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
23#endif
24
25#include "hwy/highway.h"
26
28namespace hwy {
29namespace HWY_NAMESPACE {
30
31// These functions avoid having to write a loop plus remainder handling in the
32// (unfortunately still common) case where arrays are not aligned/padded. If the
33// inputs are known to be aligned/padded, it is more efficient to write a single
34// loop using Load(). We do not provide a CopyAlignedPadded because it
35// would be more verbose than such a loop.
36
37// Fills `to`[0, `count`) with `value`.
38template <class D, typename T = TFromD<D>>
39void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
40 const size_t N = Lanes(d);
41 const Vec<D> v = Set(d, value);
42
43 size_t idx = 0;
44 for (; idx + N <= count; idx += N) {
45 StoreU(v, d, to + idx);
46 }
47
48 // `count` was a multiple of the vector length `N`: already done.
49 if (HWY_UNLIKELY(idx == count)) return;
50
51 const size_t remaining = count - idx;
52 HWY_DASSERT(0 != remaining && remaining < N);
53 SafeFillN(remaining, value, d, to + idx);
54}
55
56// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
57template <class D, typename T = TFromD<D>>
58void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
59 const size_t N = Lanes(d);
60
61 size_t idx = 0;
62 for (; idx + N <= count; idx += N) {
63 const Vec<D> v = LoadU(d, from + idx);
64 StoreU(v, d, to + idx);
65 }
66
67 // `count` was a multiple of the vector length `N`: already done.
68 if (HWY_UNLIKELY(idx == count)) return;
69
70 const size_t remaining = count - idx;
71 HWY_DASSERT(0 != remaining && remaining < N);
72 SafeCopyN(remaining, d, from + idx, to + idx);
73}
74
75// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
76// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
77// of the newly written elements in `to`.
78//
79// `func` is either a functor with a templated operator()(d, v) returning a
80// mask, or a generic lambda if using C++14. Due to apparent limitations of
81// Clang on Windows, it is currently necessary to add HWY_ATTR before the
82// opening { of the lambda to avoid errors about "function .. requires target".
83//
84// NOTE: this is only supported for 16-, 32- or 64-bit types.
85// NOTE: Func may be called a second time for elements it has already seen, but
86// these elements will not be written to `to` again.
87template <class D, class Func, typename T = TFromD<D>>
88T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
89 const Func& func) {
90 const size_t N = Lanes(d);
91
92 size_t idx = 0;
93 for (; idx + N <= count; idx += N) {
94 const Vec<D> v = LoadU(d, from + idx);
95 to += CompressBlendedStore(v, func(d, v), d, to);
96 }
97
98 // `count` was a multiple of the vector length `N`: already done.
99 if (HWY_UNLIKELY(idx == count)) return to;
100
101#if HWY_MEM_OPS_MIGHT_FAULT
102 // Proceed one by one.
103 const CappedTag<T, 1> d1;
104 for (; idx < count; ++idx) {
105 using V1 = Vec<decltype(d1)>;
106 // Workaround for -Waggressive-loop-optimizations on GCC 8
107 // (iteration 2305843009213693951 invokes undefined behavior for T=i64)
108 const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
109 const T* HWY_RESTRICT from_idx =
110 reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
111 const V1 v = LoadU(d1, from_idx);
112 // Avoid storing to `to` unless we know it should be kept - otherwise, we
113 // might overrun the end if it was allocated for the exact count.
114 if (CountTrue(d1, func(d1, v)) == 0) continue;
115 StoreU(v, d1, to);
116 to += 1;
117 }
118#else
119 // Start index of the last unaligned whole vector, ending at the array end.
120 const size_t last = count - N;
121 // Number of elements before `from` or already written.
122 const size_t invalid = idx - last;
123 HWY_DASSERT(0 != invalid && invalid < N);
124 const Mask<D> mask = Not(FirstN(d, invalid));
125 const Vec<D> v = MaskedLoad(mask, d, from + last);
126 to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
127#endif
128 return to;
129}
130
131// NOLINTNEXTLINE(google-readability-namespace-comments)
132} // namespace HWY_NAMESPACE
133} // namespace hwy
135
136#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_UNLIKELY(expr)
Definition base.h:76
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
d
Definition rvv-inl.h:1998
T * CopyIf(D d, const T *HWY_RESTRICT from, size_t count, T *HWY_RESTRICT to, const Func &func)
Definition copy-inl.h:88
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition ops/shared-inl.h:184
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
void Fill(D d, T value, size_t count, T *HWY_RESTRICT to)
Definition copy-inl.h:39
void Copy(D d, const T *HWY_RESTRICT from, size_t count, T *HWY_RESTRICT to)
Definition copy-inl.h:58
HWY_API void SafeFillN(const size_t num, const T value, D d, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:96
HWY_API void SafeCopyN(const size_t num, D d, const T *HWY_RESTRICT from, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:111
decltype(MaskFromVec(Zero(D()))) Mask
Definition generic_ops-inl.h:46
N
Definition rvv-inl.h:1998
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
const vfloat64m1_t v
Definition rvv-inl.h:1998
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:40
Definition aligned_allocator.h:27
FuncOutput(*)(const void *, FuncInput) Func
Definition nanobenchmark.h:105
#define HWY_NAMESPACE
Definition set_macros-inl.h:82