59 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
61 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
62 const Vec128<type##_t, size> c
86 name(
HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
87 return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
88 HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
// Instantiates `name` for uint8 lanes at every lane count handled here.
// The 16-lane instantiation passes prefix##q; the 8/4/2/1-lane ones pass
// prefix unchanged. All use the u8 intrinsic suffix.
98#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
99 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
100 HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \
101 HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \
102 HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \
103 HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
// Instantiates `name` for int8 lanes: 16 lanes via prefix##q, and 8/4/2/1
// lanes via prefix. All use the s8 intrinsic suffix.
106#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
107 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
108 HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \
109 HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \
110 HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \
111 HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
// Instantiates `name` for uint16 lanes: 8 lanes via prefix##q, and 4/2/1
// lanes via prefix. All use the u16 intrinsic suffix.
114#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
115 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
116 HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \
117 HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \
118 HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
// Instantiates `name` for int16 lanes: 8 lanes via prefix##q, and 4/2/1
// lanes via prefix. All use the s16 intrinsic suffix.
121#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
122 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
123 HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \
124 HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \
125 HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
// Instantiates `name` for uint32 lanes: 4 lanes via prefix##q, and 2/1
// lanes via prefix. All use the u32 intrinsic suffix.
128#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
129 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
130 HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \
131 HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
// Instantiates `name` for int32 lanes: 4 lanes via prefix##q, and 2/1
// lanes via prefix. All use the s32 intrinsic suffix.
134#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
135 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
136 HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \
137 HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
// Instantiates `name` for uint64 lanes: 2 lanes via prefix##q and 1 lane
// via prefix, both with the u64 intrinsic suffix.
140#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
141 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
142 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
// Instantiates `name` for int64 lanes: 2 lanes via prefix##q and 1 lane
// via prefix, both with the s64 intrinsic suffix.
145#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
146 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
147 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
// Instantiates `name` for float32 lanes: 4 lanes via prefix##q, and 2/1
// lanes via prefix. All use the f32 intrinsic suffix.
150#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
151 HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
152 HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \
153 HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
// Instantiates `name` for float64 lanes: 2 lanes via prefix##q and 1 lane
// via prefix, both with the f64 intrinsic suffix.
157#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \
158 HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
159 HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
// Alternative definition that expands to nothing, so no float64 overloads
// are generated. NOTE(review): the preprocessor conditional selecting between
// the two definitions is not visible in this excerpt - presumably it tests
// whether the target supports f64 vectors; confirm against the full file.
161#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
// Instantiates `name` for float32 lanes and, if HWY_NEON_DEF_FUNCTION_FLOAT_64
// is non-empty, for float64 lanes as well.
166#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
167 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
168 HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
// Instantiates `name` for unsigned 8-, 16- and 32-bit lanes (not 64-bit).
172#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
173 HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
174 HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
175 HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
// Instantiates `name` for signed 8-, 16- and 32-bit lanes (not 64-bit).
178#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
179 HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
180 HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
181 HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
// Instantiates `name` for all unsigned lane widths: 8, 16, 32 and 64 bits.
184#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
185 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
186 HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
// Instantiates `name` for all signed lane widths: 8, 16, 32 and 64 bits.
189#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
190 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
191 HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
// Instantiates `name` for every integer lane type, signed and unsigned.
194#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
195 HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
196 HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
// Instantiates `name` for every integer lane type plus the floating-point
// types covered by HWY_NEON_DEF_FUNCTION_ALL_FLOATS.
199#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
200 HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
201 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
// Instantiates `name` for unsigned and signed 8/16/32-bit lanes and for
// float32 lanes; 64-bit integer and float64 lanes are excluded.
203#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
204 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
205 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
206 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
// Instantiates `name` only for the largest lane count of each integer type
// (16x8, 8x16, 4x32 and 2x64 bits, unsigned then signed). Every instantiation
// uses prefix##q; no partial-vector overloads are generated.
209#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args) \
210 HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
211 HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
212 HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
213 HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
214 HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
215 HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
216 HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
217 HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
// vuzp1* wrappers: each expands to the matching vuzp_*/vuzpq_* call and
// selects element .val[0] of the two-vector result.
// NOTE(review): presumably these stand in for native vuzp1 intrinsics on
// targets whose headers lack them; the enclosing guard is not visible here.
221#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
222#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
223#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
224#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
225#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
226#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
227#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
228#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
229#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
230#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
231#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
232#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
233#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
234#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
// vuzp2* wrappers: each expands to the matching vuzp_*/vuzpq_* call and
// selects element .val[1] of the two-vector result.
// NOTE(review): presumably these stand in for native vuzp2 intrinsics on
// targets whose headers lack them; the enclosing guard is not visible here.
235#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
236#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
237#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
238#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
239#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
240#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
241#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
242#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
243#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
244#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
245#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
246#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
247#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
248#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
// vzip1* wrappers: each expands to the matching vzip_*/vzipq_* call and
// selects element .val[0] of the two-vector result.
// NOTE(review): presumably these stand in for native vzip1 intrinsics on
// targets whose headers lack them; the enclosing guard is not visible here.
249#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
250#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
251#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
252#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
253#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
254#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
255#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
256#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
257#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
258#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
259#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
260#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
261#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
262#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
// vzip2* wrappers: each expands to the matching vzip_*/vzipq_* call and
// selects element .val[1] of the two-vector result.
// NOTE(review): presumably these stand in for native vzip2 intrinsics on
// targets whose headers lack them; the enclosing guard is not visible here.
263#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
264#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
265#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
266#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
267#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
268#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
269#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
270#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
271#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
272#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
273#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
274#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
275#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
276#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
282template <
typename T,
size_t N>
284template <
typename T,
size_t N>
286template <
typename T,
size_t N>
592template <
typename T,
size_t N>
654 using type = float64x2_t;
717 using type = float64x1_t;
777template <
typename T,
size_t N = 16 /
sizeof(T)>
783 static constexpr size_t kPrivateN =
N;
793 return *
this = (*
this * other);
796 return *
this = (*
this / other);
799 return *
this = (*
this + other);
802 return *
this = (*
this - other);
805 return *
this = (*
this & other);
808 return *
this = (*
this | other);
811 return *
this = (*
this ^ other);
824template <
typename T,
size_t N = 16 /
sizeof(T)>
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_CAST_TO_U8.
// TPL is empty. NOTE(review): presumably this means no extra template header
// is emitted; the line of HWY_NEON_DEF_FUNCTION that consumes it is not
// visible here.
853#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
// Return type: a uint8_t vector with as many lanes as the input has bytes.
854#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
855 Vec128<uint8_t, size * sizeof(type##_t)>
// Parameter list: the single input vector `v`.
856#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
// Argument forwarded to the intrinsic: the raw NEON register of `v`.
857#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
// Remove the helper macros so they do not leak past their point of use.
882#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
883#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
884#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
885#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
895template <
size_t N, HWY_IF_LE64(
int8_t, N)>
900template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
905template <
size_t N, HWY_IF_LE64(
int16_t, N)>
910template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
915template <
size_t N, HWY_IF_LE64(
int32_t, N)>
920template <
size_t N, HWY_IF_LE64(
float, N)>
996template <
typename T,
size_t N,
typename FromT>
998 Vec128<FromT, N *
sizeof(T) /
sizeof(FromT)> v) {
999 return detail::BitCastFromByte(
d, detail::BitCastToByte(
v));
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_SET1.
// TPL is empty (no extra template parameters are supplied).
1005#define HWY_NEON_BUILD_TPL_HWY_SET1
// Return type: a vector of `size` lanes of type##_t.
1006#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
// Parameter list: an unnamed Simd tag (used only for overload selection)
// followed by the scalar `t`.
1007#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
1008 Simd<type##_t, size, 0> , const type##_t t
// Argument forwarded to the intrinsic: the scalar `t` only.
1009#define HWY_NEON_BUILD_ARG_HWY_SET1 t
// Remove the helper macros so they do not leak past their point of use.
1013#undef HWY_NEON_BUILD_TPL_HWY_SET1
1014#undef HWY_NEON_BUILD_RET_HWY_SET1
1015#undef HWY_NEON_BUILD_PARAM_HWY_SET1
1016#undef HWY_NEON_BUILD_ARG_HWY_SET1
1019template <
typename T,
size_t N>
1039template <typename T, size_t
N>
1048template <
typename T,
size_t N,
typename T2>
1051 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
1055 return Load(
d, lanes);
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_GET.
// Template header: the lane index is a compile-time parameter kLane.
1061#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
// Return type: a single scalar lane value.
1062#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
// Parameter list: the vector `v` to read from.
1063#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
// Arguments forwarded to the intrinsic: raw register, then the lane index.
1064#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
// Remove the helper macros so they do not leak past their point of use.
1068#undef HWY_NEON_BUILD_TPL_HWY_GET
1069#undef HWY_NEON_BUILD_RET_HWY_GET
1070#undef HWY_NEON_BUILD_PARAM_HWY_GET
1071#undef HWY_NEON_BUILD_ARG_HWY_GET
1077 return detail::GetLane<0>(
v);
1084template <
typename T>
1088 return detail::GetLane<0>(
v);
1091template <
typename T>
1093#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1094 if (__builtin_constant_p(i)) {
1097 return detail::GetLane<0>(
v);
1099 return detail::GetLane<1>(
v);
1103 alignas(16) T lanes[2];
1108template <
typename T>
1110#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1111 if (__builtin_constant_p(i)) {
1114 return detail::GetLane<0>(
v);
1116 return detail::GetLane<1>(
v);
1118 return detail::GetLane<2>(
v);
1120 return detail::GetLane<3>(
v);
1124 alignas(16) T lanes[4];
1129template <
typename T>
1131#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1132 if (__builtin_constant_p(i)) {
1135 return detail::GetLane<0>(
v);
1137 return detail::GetLane<1>(
v);
1139 return detail::GetLane<2>(
v);
1141 return detail::GetLane<3>(
v);
1143 return detail::GetLane<4>(
v);
1145 return detail::GetLane<5>(
v);
1147 return detail::GetLane<6>(
v);
1149 return detail::GetLane<7>(
v);
1153 alignas(16) T lanes[8];
1158template <
typename T>
1160#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1161 if (__builtin_constant_p(i)) {
1164 return detail::GetLane<0>(
v);
1166 return detail::GetLane<1>(
v);
1168 return detail::GetLane<2>(
v);
1170 return detail::GetLane<3>(
v);
1172 return detail::GetLane<4>(
v);
1174 return detail::GetLane<5>(
v);
1176 return detail::GetLane<6>(
v);
1178 return detail::GetLane<7>(
v);
1180 return detail::GetLane<8>(
v);
1182 return detail::GetLane<9>(
v);
1184 return detail::GetLane<10>(
v);
1186 return detail::GetLane<11>(
v);
1188 return detail::GetLane<12>(
v);
1190 return detail::GetLane<13>(
v);
1192 return detail::GetLane<14>(
v);
1194 return detail::GetLane<15>(
v);
1198 alignas(16) T lanes[16];
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_INSERT.
// Template header: the lane index is a compile-time parameter kLane.
1206#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
// Return type: a vector of the same type and lane count as the input.
1207#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
// Parameter list: the vector `v` and the scalar `t` to insert.
1208#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
1209 Vec128<type##_t, size> v, type##_t t
// Arguments forwarded to the intrinsic. Note the order differs from the
// parameter list: scalar first, then raw register, then lane index.
1210#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
// Remove the helper macros so they do not leak past their point of use.
1214#undef HWY_NEON_BUILD_TPL_HWY_INSERT
1215#undef HWY_NEON_BUILD_RET_HWY_INSERT
1216#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
1217#undef HWY_NEON_BUILD_ARG_HWY_INSERT
1224template <
typename T>
1231template <
typename T>
1233#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1234 if (__builtin_constant_p(i)) {
1237 return detail::InsertLane<0>(
v, t);
1239 return detail::InsertLane<1>(
v, t);
1244 alignas(16) T lanes[2];
1247 return Load(
d, lanes);
1250template <
typename T>
1252#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1253 if (__builtin_constant_p(i)) {
1256 return detail::InsertLane<0>(
v, t);
1258 return detail::InsertLane<1>(
v, t);
1260 return detail::InsertLane<2>(
v, t);
1262 return detail::InsertLane<3>(
v, t);
1267 alignas(16) T lanes[4];
1270 return Load(
d, lanes);
1273template <
typename T>
1275#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1276 if (__builtin_constant_p(i)) {
1279 return detail::InsertLane<0>(
v, t);
1281 return detail::InsertLane<1>(
v, t);
1283 return detail::InsertLane<2>(
v, t);
1285 return detail::InsertLane<3>(
v, t);
1287 return detail::InsertLane<4>(
v, t);
1289 return detail::InsertLane<5>(
v, t);
1291 return detail::InsertLane<6>(
v, t);
1293 return detail::InsertLane<7>(
v, t);
1298 alignas(16) T lanes[8];
1301 return Load(
d, lanes);
1304template <
typename T>
1306#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC
1307 if (__builtin_constant_p(i)) {
1310 return detail::InsertLane<0>(
v, t);
1312 return detail::InsertLane<1>(
v, t);
1314 return detail::InsertLane<2>(
v, t);
1316 return detail::InsertLane<3>(
v, t);
1318 return detail::InsertLane<4>(
v, t);
1320 return detail::InsertLane<5>(
v, t);
1322 return detail::InsertLane<6>(
v, t);
1324 return detail::InsertLane<7>(
v, t);
1326 return detail::InsertLane<8>(
v, t);
1328 return detail::InsertLane<9>(
v, t);
1330 return detail::InsertLane<10>(
v, t);
1332 return detail::InsertLane<11>(
v, t);
1334 return detail::InsertLane<12>(
v, t);
1336 return detail::InsertLane<13>(
v, t);
1338 return detail::InsertLane<14>(
v, t);
1340 return detail::InsertLane<15>(
v, t);
1345 alignas(16) T lanes[16];
1348 return Load(
d, lanes);
// Temporarily replaces HWY_NEON_DEF_FUNCTION with a variant that generates
// template<int kBits> functions taking one vector. The previous definition is
// saved with push_macro so it can be restored afterwards.
// The generated function returns `v` unchanged when kBits == 0; otherwise it
// calls the intrinsic prefix##infix##suffix with v.raw and the shift count.
// HWY_MAX(1, kBits) keeps the immediate at 1 or more even in the kBits == 0
// instantiation, where that branch is still compiled but never taken.
// NOTE(review): presumably needed because some shift intrinsics reject an
// immediate of 0; confirm against the intrinsics this is instantiated with.
// The `args` parameter is not used by this variant.
1424#pragma push_macro("HWY_NEON_DEF_FUNCTION")
1425#undef HWY_NEON_DEF_FUNCTION
1426#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
1427 template <int kBits> \
1428 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \
1429 return kBits == 0 ? v \
1430 : Vec128<type##_t, size>(HWY_NEON_EVAL( \
1431 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
1439#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
1443template <
int kBits,
size_t N>
1445 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1446 if (kBits == 0)
return v;
1450template <
int kBits,
size_t N>
1452 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1453 if (kBits == 0)
return v;
1466template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1476template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1486template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1505template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1515template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1525template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1547template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
1559template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1571template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1593template <
size_t N, HWY_IF_LE64(
int8_t, N)>
1603template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1613template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1630template <
typename T,
size_t N>
1632 return v << Set(Simd<T, N, 0>(),
static_cast<T
>(bits));
1634template <
typename T,
size_t N>
1651template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1656template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
1672template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1677template <
size_t N, HWY_IF_LE64(
int32_t, N)>
1686 int32x4_t rlo = vmull_s16(vget_low_s16(a.
raw), vget_low_s16(b.
raw));
1688 int32x4_t rhi = vmull_high_s16(a.
raw, b.
raw);
1690 int32x4_t rhi = vmull_s16(vget_high_s16(a.
raw), vget_high_s16(b.
raw));
1693 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
1697 uint32x4_t rlo = vmull_u16(vget_low_u16(a.
raw), vget_low_u16(b.
raw));
1699 uint32x4_t rhi = vmull_high_u16(a.
raw, b.
raw);
1701 uint32x4_t rhi = vmull_u16(vget_high_u16(a.
raw), vget_high_u16(b.
raw));
1704 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
1707template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1710 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.
raw, b.
raw));
1713template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
1716 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.
raw, b.
raw));
1723template <
size_t N, HWY_IF_LE64(
int16_t, N)>
1764 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1765 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1766 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
1776template <
size_t N, HWY_IF_LE64(
float, N)>
1785#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
// MulAdd for float vectors selected by HWY_IF_LE64 (partial vectors):
// returns mul * x + add via vfma_f32, with `add` passed as the accumulator
// (first) operand of the intrinsic.
1786template <
size_t N, HWY_IF_LE64(
float, N)>
1787HWY_API Vec128<float, N> MulAdd(
const Vec128<float, N> mul,
1788 const Vec128<float, N> x,
1789 const Vec128<float, N> add) {
1790 return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
// MulAdd for full float vectors: returns mul * x + add via vfmaq_f32, with
// `add` passed as the accumulator (first) operand of the intrinsic.
1792HWY_API Vec128<float> MulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1793 const Vec128<float> add) {
1794 return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
1802 return mul * x + add;
// MulAdd for a single-lane double vector: returns mul * x + add via
// vfma_f64, with `add` as the accumulator (first) operand.
1807HWY_API Vec64<double> MulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1808 const Vec64<double> add) {
1809 return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
// MulAdd for a full double vector: returns mul * x + add via vfmaq_f64,
// with `add` as the accumulator (first) operand.
1811HWY_API Vec128<double> MulAdd(
const Vec128<double> mul,
const Vec128<double> x,
1812 const Vec128<double> add) {
1813 return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
1818#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
1819template <
size_t N, HWY_IF_LE64(
float, N)>
1821 const Vec128<float, N> x,
1822 const Vec128<float, N> add) {
1823 return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
// NegMulAdd for full float vectors: returns add - mul * x via vfmsq_f32,
// with `add` as the accumulator (first) operand.
1825HWY_API Vec128<float>
NegMulAdd(
const Vec128<float> mul,
const Vec128<float> x,
1826 const Vec128<float> add) {
1827 return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
1835 return add - mul * x;
// NegMulAdd for a single-lane double vector: returns add - mul * x via
// vfms_f64, with `add` as the accumulator (first) operand.
1840HWY_API Vec64<double> NegMulAdd(
const Vec64<double> mul,
const Vec64<double> x,
1841 const Vec64<double> add) {
1842 return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
// NegMulAdd for a full double vector: returns add - mul * x via vfmsq_f64,
// with `add` as the accumulator (first) operand.
1844HWY_API Vec128<double> NegMulAdd(
const Vec128<double> mul,
1845 const Vec128<double> x,
1846 const Vec128<double> add) {
1847 return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
// MulSub for double vectors: returns mul * x - sub, implemented as MulAdd
// with the negated `sub` as the addend.
1869HWY_API Vec128<double, N> MulSub(
const Vec128<double, N> mul,
1870 const Vec128<double, N> x,
1871 const Vec128<double, N> sub) {
1872 return MulAdd(mul, x, Neg(sub));
// NegMulSub for double vectors: returns -(mul * x + sub), i.e.
// -mul * x - sub, implemented by negating the result of MulAdd.
1875HWY_API Vec128<double, N> NegMulSub(
const Vec128<double, N> mul,
1876 const Vec128<double, N> x,
1877 const Vec128<double, N> sub) {
1878 return Neg(MulAdd(mul, x, sub));
1916 recip *= detail::ReciprocalSqrtStep(
v * recip, recip);
1917 recip *= detail::ReciprocalSqrtStep(
v * recip, recip);
1918 recip *= detail::ReciprocalSqrtStep(
v * recip, recip);
1920 const auto root =
v * recip;
1930template <
typename T>
1936template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1940 using V8 =
decltype(
Zero(d8));
1963template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
1966 return detail::reversed_andnot(mask, not_mask);
// AndNot overload for floating-point lane types. Both operands are bit-cast
// to the unsigned descriptor `du`, combined with detail::reversed_andnot
// (note the swapped argument order: mask first, not_mask second), and the
// result is bit-cast back to the original lane type.
// NOTE(review): presumably the result is (~not_mask) & mask; the definition
// of detail::reversed_andnot is not visible in this excerpt.
1970template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1971HWY_API Vec128<T, N> AndNot(
const Vec128<T, N> not_mask,
1972 const Vec128<T, N> mask) {
1973 const DFromV<
decltype(mask)> d;
1974 const RebindToUnsigned<
decltype(d)> du;
1975 VFromD<
decltype(du)> ret =
1976 detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
1977 return BitCast(d, ret);
2005#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
2011HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
2012 return Xor(x1, Xor(x2, x3));
// Xor3 overload for floating-point lane types: bit-casts the three inputs to
// the unsigned descriptor `du`, forwards to Xor3 on those vectors, and
// bit-casts the result back to the original lane type.
2015template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2016HWY_API Vec128<T, N> Xor3(
const Vec128<T, N> x1,
const Vec128<T, N> x2,
2017 const Vec128<T, N> x3) {
2018 const DFromV<
decltype(x1)> d;
2019 const RebindToUnsigned<
decltype(d)> du;
2020 return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
2024template <
typename T,
size_t N>
2026 return Xor(x1,
Xor(x2, x3));
2032template <
typename T,
size_t N>
2034 return Or(o1,
Or(o2, o3));
2039template <
typename T,
size_t N>
2041 return Or(o,
And(a1, a2));
2046template <
typename T,
size_t N>
2054template <
typename T,
size_t N>
2059template <
typename T,
size_t N>
2064template <
typename T,
size_t N>
2071#ifdef HWY_NATIVE_POPCNT
2072#undef HWY_NATIVE_POPCNT
2074#define HWY_NATIVE_POPCNT
2079template <
typename T>
2084template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2092template <
typename T>
2095 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2098template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2102 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2106template <
typename T>
2109 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2110 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
2112template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2116 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2120template <
typename T>
2123 const uint8x16_t bytes = vcntq_u8(
BitCast(d8,
v).raw);
2124 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
2126template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2130 const uint8x8_t bytes = vcnt_u8(
BitCast(d8,
v).raw);
2131 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
2136template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
2138 return detail::PopulationCount(
hwy::SizeTag<
sizeof(T)>(),
v);
2160template <
size_t N, HWY_IF_LE64(
int8_t, N)>
2164template <
size_t N, HWY_IF_LE64(
int16_t, N)>
2168template <
size_t N, HWY_IF_LE64(
int32_t, N)>
2172template <
size_t N, HWY_IF_LE64(
float, N)>
// Abs for a full double vector: lane-wise absolute value via vabsq_f64.
2178HWY_API Vec128<double> Abs(
const Vec128<double> v) {
2179 return Vec128<double>(vabsq_f64(v.raw));
// Abs for a single-lane double vector: absolute value via vabs_f64.
2182HWY_API Vec64<double> Abs(
const Vec64<double> v) {
2183 return Vec64<double>(vabs_f64(v.raw));
2189template <
typename T,
size_t N>
2192 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2197template <
typename T,
size_t N>
2200 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
2206template <
typename T,
size_t N, HWY_IF_SIGNED(T)>
2216template <
typename T,
size_t N>
2222template <
typename T,
size_t N>
2229template <
typename TFrom,
typename TTo,
size_t N>
2231 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_IF.
// TPL is empty (no extra template parameters are supplied).
2237#define HWY_NEON_BUILD_TPL_HWY_IF
// Return type: a vector of the same type and lane count as `yes`/`no`.
2238#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
// Parameter list: the selection mask followed by the two candidate vectors.
2239#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
2240 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
2241 const Vec128<type##_t, size> no
// Arguments forwarded to the intrinsic, in the same order as the parameters.
2242#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
// Remove the helper macros so they do not leak past their point of use.
2246#undef HWY_NEON_BUILD_TPL_HWY_IF
2247#undef HWY_NEON_BUILD_RET_HWY_IF
2248#undef HWY_NEON_BUILD_PARAM_HWY_IF
2249#undef HWY_NEON_BUILD_ARG_HWY_IF
2252template <
typename T,
size_t N>
2259template <
typename T,
size_t N>
2265template <
typename T,
size_t N>
2268 static_assert(IsSigned<T>(),
"Only works for signed/float");
2276template <
typename T,
size_t N>
2279 return Max(zero,
v);
2284template <
typename T,
size_t N>
2289template <
typename T,
size_t N>
2295template <
typename T,
size_t N>
2301template <
typename T,
size_t N>
2307template <
typename T,
size_t N>
2313template <
typename T,
size_t N>
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_COMPARE.
// TPL is empty (no extra template parameters are supplied).
2345#define HWY_NEON_BUILD_TPL_HWY_COMPARE
// Return type: a mask (not a vector) with the same lane type and count.
2346#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
// Parameter list: the two vectors to compare.
2347#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
2348 const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
// Arguments forwarded to the intrinsic: the raw registers of a and b.
2349#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
// Remove the helper macros so they do not leak past their point of use.
2373#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
2374#undef HWY_NEON_BUILD_RET_HWY_COMPARE
2375#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
2376#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
// Equality for int64 lanes built from 32-bit comparisons. Both inputs are
// reinterpreted as int32 lanes (twice as many) and compared; each 32-bit
// result is then ANDed with its Shuffle2301 counterpart before the vector is
// cast back to 64-bit lanes and converted to a mask.
// NOTE(review): presumably Shuffle2301 swaps adjacent 32-bit lanes, so a
// 64-bit lane is all-ones only when both halves compared equal - confirm.
2383HWY_API Mask128<int64_t, N> operator==(
const Vec128<int64_t, N> a,
2384 const Vec128<int64_t, N> b) {
2385 const Simd<int32_t, N * 2, 0> d32;
2386 const Simd<int64_t, N, 0> d64;
2387 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
2388 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
2389 return MaskFromVec(BitCast(d64, cmp64));
2394 const Vec128<uint64_t, N> b) {
2395 const Simd<uint32_t, N * 2, 0> d32;
2396 const Simd<uint64_t, N, 0> d64;
2403 const Vec128<int64_t> b) {
2404 const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
2408 const Vec64<int64_t> b) {
2409 const int64x1_t sub = vqsub_s64(a.raw, b.raw);
2415 const Vec128<uint64_t, N> b) {
2416 const DFromV<
decltype(a)> du;
2418 const Vec128<uint64_t, N> msb =
AndNot(a, b) |
AndNot(a ^ b, a - b);
// Temporarily replaces HWY_NEON_DEF_FUNCTION with a variant that ignores the
// intrinsic-name parameters (prefix, infix, suffix, args) and instead defines
// `name(a, b)` as the logical negation of `a == b`, returning a Mask128.
// The previous definition is saved with push_macro so it can be restored.
2427#pragma push_macro("HWY_NEON_DEF_FUNCTION")
2428#undef HWY_NEON_DEF_FUNCTION
2432#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
2433 HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a, \
2434 Vec128<type##_t, size> b) { \
2435 return Not(a == b); \
2440#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
2444template <
typename T,
size_t N>
2448template <
typename T,
size_t N>
2455template <
typename T,
size_t N>
// Signature pieces selected when HWY_NEON_DEF_FUNCTION is invoked with
// args = HWY_TESTBIT.
// TPL is empty (no extra template parameters are supplied).
2463#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
// Return type: a mask with the same lane type and count as the inputs.
2464#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
// Parameter list: the vector `v` and the vector `bit` to test against.
2465#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
2466 Vec128<type##_t, size> v, Vec128<type##_t, size> bit
// Arguments forwarded to the intrinsic: the raw registers of v and bit.
2467#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
2479 return (
v & bit) == bit;
2484 return (
v & bit) == bit;
2488#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
2489#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
2490#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
2491#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
2518 const
Vec128<uint64_t, N> b) {
2522 const DFromV<
decltype(a)> du;
2533 const
Vec128<int64_t, N> b) {
2556 const
Vec128<uint64_t, N> b) {
2560 const DFromV<
decltype(a)> du;
2571 const
Vec128<int64_t, N> b) {
2628HWY_API Vec128<double> LoadU(Full128<double> ,
2630 return Vec128<double>(vld1q_f64(unaligned));
2673HWY_API Vec64<double> LoadU(Full64<double> ,
2675 return Vec64<double>(vld1_f64(p));
2694template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
2698 CopyBytes<4>(p, &buf);
2715template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2719 CopyBytes<2>(p, &buf);
2740 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2747 const auto pu16 =
reinterpret_cast<const uint16_t*
>(p);
2752template <
typename T,
size_t N>
2757template <
typename T,
size_t N>
2764template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2774 vst1q_u8(unaligned,
v.raw);
2778 vst1q_u16(unaligned,
v.raw);
2782 vst1q_u32(unaligned,
v.raw);
2786 vst1q_u64(unaligned,
v.raw);
2790 vst1q_s8(unaligned,
v.raw);
2794 vst1q_s16(unaligned,
v.raw);
2798 vst1q_s32(unaligned,
v.raw);
2802 vst1q_s64(unaligned,
v.raw);
2806 vst1q_f32(unaligned,
v.raw);
2809HWY_API void StoreU(
const Vec128<double> v, Full128<double> ,
2811 vst1q_f64(unaligned, v.raw);
2854HWY_API void StoreU(
const Vec64<double> v, Full64<double> ,
2864 vst1_lane_u32(p,
v.raw, 0);
2868 vst1_lane_s32(p,
v.raw, 0);
2872 vst1_lane_f32(p,
v.raw, 0);
2875template <
typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x6)>
2879 CopyBytes<4>(&buf, p);
2886 vst1_lane_u16(p,
v.raw, 0);
2890 vst1_lane_s16(p,
v.raw, 0);
2893template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2897 CopyBytes<2>(&buf, p);
2904 vst1_lane_u8(p,
v.raw, 0);
2908 vst1_lane_s8(p,
v.raw, 0);
2916 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2923 const auto pu16 =
reinterpret_cast<uint16_t*
>(p);
2928#if HWY_COMPILER_GCC_ACTUAL
2933template <typename T, size_t N>
2940template <
typename T,
size_t N>
2945 const auto blended =
2954template <
typename T,
size_t N>
2971 uint16x8_t a = vmovl_u8(
v.raw);
2986 uint16x8_t a = vmovl_u8(
v.raw);
2994template <
size_t N, HWY_IF_LE64(u
int16_t, N)>
2999template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3002 uint16x8_t a = vmovl_u8(
v.raw);
3010template <
size_t N, HWY_IF_LE64(u
int64_t, N)>
3015template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3020template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3023 uint16x8_t a = vmovl_u8(
v.raw);
3024 uint32x4_t b = vmovl_u16(vget_low_u16(a));
3027template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3030 uint32x4_t a = vmovl_u16(
v.raw);
3041 int16x8_t a = vmovl_s8(
v.raw);
3062 int16x8_t a = vmovl_s8(
v.raw);
3063 int32x4_t b = vmovl_s16(vget_low_s16(a));
// Promotes four float16 lanes to a full float vector. v.raw is reinterpreted
// from uint16 bits to the native f16 vector type (vreinterpret_f16_u16) and
// then widened with vcvt_f32_f16. The tag parameter is unnamed and only
// selects the overload.
3079HWY_API Vec128<float> PromoteTo(Full128<float> ,
3080 const Vec128<float16_t, 4> v) {
3081 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3082 return Vec128<float>(f32);
// Partial-vector variant of the float16 -> float promotion: performs the same
// four-lane conversion, then keeps only the lower half of the float32x4_t
// result (vget_low_f32) as the storage for the N-lane output.
3085HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> ,
3086 const Vec128<float16_t, N> v) {
3087 const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
3088 return Vec128<float, N>(vget_low_f32(f32));
3100 const auto sign = ShiftRight<15>(bits16);
3101 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3102 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3103 const auto subnormal =
3105 Set(df32, 1.0f / 16384 / 1024));
3107 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3108 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3109 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3110 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3111 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
// Promotes two float lanes to a full double vector via vcvt_f64_f32.
3118HWY_API Vec128<double> PromoteTo(Full128<double> ,
3119 const Vec64<float> v) {
3120 return Vec128<double>(vcvt_f64_f32(v.raw));
// Promotes one float lane to a single-lane double vector: converts with
// vcvt_f64_f32 (which yields two doubles) and keeps the low one.
3123HWY_API Vec64<double> PromoteTo(Full64<double> ,
3124 const Vec32<float> v) {
3125 return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
3129 const Vec64<int32_t> v) {
3130 const int64x2_t i64 = vmovl_s32(
v.raw);
3131 return Vec128<double>(vcvtq_f64_s64(i64));
3135 const Vec32<int32_t> v) {
3136 const int64x1_t i64 = vget_low_s64(vmovl_s32(
v.raw));
3137 return Vec64<double>(vcvt_f64_s64(i64));
3155 const uint16x4_t a = vqmovun_s32(
v.raw);
3164 const int16x4_t a = vqmovn_s32(
v.raw);
3173template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3178template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3183template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3186 const uint16x4_t a = vqmovun_s32(vcombine_s32(
v.raw,
v.raw));
3189template <
size_t N, HWY_IF_LE64(
int16_t, N)>
3194template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3197 const int16x4_t a = vqmovn_s32(vcombine_s32(
v.raw,
v.raw));
3200template <
size_t N, HWY_IF_LE64(
int16_t, N)>
// Demotes a full float vector to four float16 lanes via vcvt_f16_f32. The
// result is reinterpreted as uint16 bits (vreinterpret_u16_f16) because that
// is the raw type used to construct Vec128<float16_t, 4> here.
3208HWY_API Vec128<float16_t, 4> DemoteTo(Full64<float16_t> ,
3209 const Vec128<float> v) {
3210 return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
3214 const Vec128<float, N> v) {
3215 const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(
v.raw,
v.raw));
3216 return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
3225 const Rebind<uint32_t,
decltype(du16)> du;
3227 const auto bits32 =
BitCast(du,
v);
3228 const auto sign = ShiftRight<31>(bits32);
3229 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3230 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3232 const auto k15 =
Set(di, 15);
3233 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3234 const auto is_tiny = exp <
Set(di, -24);
3236 const auto is_subnormal = exp <
Set(di, -14);
3237 const auto biased_exp16 =
3239 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3240 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3241 (mantissa32 >> (
Set(du, 13) + sub_exp));
3243 ShiftRight<13>(mantissa32));
3245 const auto sign16 = ShiftLeft<15>(sign);
3246 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3256 const Rebind<int32_t,
decltype(dbf16)> di32;
3257 const Rebind<uint32_t,
decltype(dbf16)> du32;
3258 const Rebind<uint16_t,
decltype(dbf16)> du16;
3259 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
// Demotes a full double vector to two float lanes via vcvt_f32_f64.
3265HWY_API Vec64<float> DemoteTo(Full64<float> ,
const Vec128<double> v) {
3266 return Vec64<float>(vcvt_f32_f64(v.raw));
3268HWY_API Vec32<float> DemoteTo(Full32<float> ,
const Vec64<double> v) {
3269 return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
3273 const Vec128<double> v) {
3274 const int64x2_t i64 = vcvtq_s64_f64(
v.raw);
3275 return Vec64<int32_t>(vqmovn_s64(i64));
3278 const Vec64<double> v) {
3279 const int64x1_t i64 = vcvt_s64_f64(
v.raw);
3281 const int64x2_t i64x2 = vcombine_s64(i64, i64);
3282 return Vec32<int32_t>(vqmovn_s64(i64x2));
3288 const uint8x16_t org_v = detail::BitCastToByte(
v).raw;
3289 const uint8x16_t w = vuzp1q_u8(org_v, org_v);
3292template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3294 const uint8x8_t org_v = detail::BitCastToByte(
v).raw;
3295 const uint8x8_t w = vuzp1_u8(org_v, org_v);
3310 uint16x8_t c = vcombine_u16(a.
raw, b.
raw);
3319 int16x8_t c = vcombine_s16(a.
raw, b.
raw);
3328 const
Vec128<int32_t> v) {
3331template <
size_t N, HWY_IF_LE64(
int32_t, N)>
3341template <
size_t N, HWY_IF_LE64(u
int32_t, N)>
3352template <
size_t N, HWY_IF_LE64(
float, N)>
3360HWY_API Vec128<double> ConvertTo(Full128<double> ,
3361 const Vec128<int64_t> v) {
3362 return Vec128<double>(vcvtq_f64_s64(v.raw));
3364HWY_API Vec64<double> ConvertTo(Full64<double> ,
3365 const Vec64<int64_t> v) {
3366 return Vec64<double>(vcvt_f64_s64(v.raw));
3370 const Vec128<uint64_t> v) {
3371 return Vec128<double>(vcvtq_f64_u64(
v.raw));
3374 const Vec64<uint64_t> v) {
3375 return Vec64<double>(vcvt_f64_u64(
v.raw));
3380 const Vec128<double> v) {
3381 return Vec128<int64_t>(vcvtq_s64_f64(
v.raw));
3384 const Vec64<double> v) {
3385 return Vec64<int64_t>(vcvt_s64_f64(
v.raw));
3430 const auto int_f =
ConvertTo(df, integer);
3443 const auto max =
Set(df, MantissaEnd<float>());
3445 const auto added = large +
v;
3446 const auto rounded = added - large;
3458 const auto int_f =
ConvertTo(df, integer);
3472 const auto int_f =
ConvertTo(df, integer);
3486HWY_API Vec128<int32_t> NearestInt(
const Vec128<float> v) {
3487 return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
3489template <
size_t N, HWY_IF_LE64(
float, N)>
3490HWY_API Vec128<int32_t, N> NearestInt(
const Vec128<float, N> v) {
3491 return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
3505template <
typename T,
size_t N>
3510template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3516 return RebindMask(
d, Eq(Add(vi, vi),
Set(di, hwy::MaxExponentTimes2<T>())));
3520template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
3529 const VFromD<
decltype(di)> exp =
3531 return RebindMask(
d, Lt(exp,
Set(di, hwy::MaxExponentField<T>())));
3539template <
typename T,
size_t N, HWY_IF_LE64(u
int8_t, N)>
3572HWY_API Vec64<double> LowerHalf(
const Vec128<double> v) {
3573 return Vec64<double>(vget_low_f64(v.raw));
3582template <
typename T,
size_t N>
3591template <
int kBytes,
typename T,
class V128 = Vec128<T>>
3593 static_assert(0 < kBytes && kBytes < 16,
"kBytes must be in [1, 15]");
3595 uint8x16_t v8 = vextq_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3600template <
int kBytes,
typename T>
3602 static_assert(0 < kBytes && kBytes < 8,
"kBytes must be in [1, 7]");
3604 uint8x8_t v8 = vext_u8(
BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes);
3616template <
int kBytes>
3626 template <
class T,
size_t N, HWY_IF_LE64(T, N)>
3630 const auto zero64 =
Zero(d64);
3631 const decltype(zero64) v64(
v.raw);
3633 CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
3638 template <
class T,
size_t N>
3645 template <
class T,
size_t N>
3651template <
int kBytes>
3653 template <
class T,
size_t N>
3657 if (
N *
sizeof(T) < 8) {
3658 constexpr size_t kReg =
N *
sizeof(T) == 16 ? 16 : 8;
3659 const Simd<T, kReg /
sizeof(T), 0> dreg;
3663 return CombineShiftRightBytes<kBytes>(
d,
Zero(
d),
v);
3668 template <
class T,
size_t N>
3675 template <
class T,
size_t N>
3683template <
int kBytes,
typename T,
size_t N>
3685 return detail::ShiftLeftBytesT < kBytes >=
N *
sizeof(T) ? 0xFF
3689template <
int kBytes,
typename T,
size_t N>
3694template <
int kLanes,
typename T,
size_t N>
3700template <
int kLanes,
typename T,
size_t N>
3706template <
int kBytes,
typename T,
size_t N>
3708 return detail::ShiftRightBytesT < kBytes >=
N *
sizeof(T) ? 0xFF
3712template <
int kLanes,
typename T,
size_t N>
3719template <
int kBytes,
typename T,
size_t N, HWY_IF_LE32(T, N)>
3722 constexpr size_t kSize =
N *
sizeof(T);
3723 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
3727 using V64 =
VFromD<
decltype(d_full8)>;
3728 const V64 hi64(
BitCast(d8, hi).raw);
3775HWY_API Vec64<double> UpperHalf(Full64<double> ,
3776 const Vec128<double> v) {
3777 return Vec64<double>(vget_high_f64(v.raw));
3784 const Twice<
decltype(duh)> du;
3789template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3796 return Vec128<T, (
N + 1) / 2>(upper.raw);
3804HWY_API Vec128<uint16_t> Broadcast(
const Vec128<uint16_t> v) {
3805 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3806 return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
3808template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3809HWY_API Vec128<uint16_t, N> Broadcast(
const Vec128<uint16_t, N> v) {
3810 static_assert(0 <= kLane && kLane < N,
"Invalid lane");
3811 return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
3815 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3816 return Vec128<uint32_t>(vdupq_laneq_u32(
v.raw, kLane));
3818template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3820 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3821 return Vec128<uint32_t, N>(vdup_lane_u32(
v.raw, kLane));
3825 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3826 return Vec128<uint64_t>(vdupq_laneq_u64(
v.raw, kLane));
3833 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3834 return Vec128<int16_t>(vdupq_laneq_s16(
v.raw, kLane));
3836template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3838 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3839 return Vec128<int16_t, N>(vdup_lane_s16(
v.raw, kLane));
3843 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3844 return Vec128<int32_t>(vdupq_laneq_s32(
v.raw, kLane));
3846template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3848 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3849 return Vec128<int32_t, N>(vdup_lane_s32(
v.raw, kLane));
3853 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3854 return Vec128<int64_t>(vdupq_laneq_s64(
v.raw, kLane));
3861 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3862 return Vec128<float>(vdupq_laneq_f32(
v.raw, kLane));
3864template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3866 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3867 return Vec128<float, N>(vdup_lane_f32(
v.raw, kLane));
3871 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3872 return Vec128<double>(vdupq_laneq_f64(
v.raw, kLane));
3876 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3886 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3889template <
int kLane,
size_t N, HWY_IF_LE64(u
int16_t, N)>
3891 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3896 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3899template <
int kLane,
size_t N, HWY_IF_LE64(u
int32_t, N)>
3901 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3906 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3914 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
3917template <
int kLane,
size_t N, HWY_IF_LE64(
int16_t, N)>
3919 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3924 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3927template <
int kLane,
size_t N, HWY_IF_LE64(
int32_t, N)>
3929 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3934 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
3942 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
3945template <
int kLane,
size_t N, HWY_IF_LE64(
float, N)>
3947 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3955 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3960 static_assert(0 <= kLane && kLane < 1,
"Invalid lane");
3967template <
typename T,
size_t N>
3972template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3974 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3975#if HWY_IS_DEBUG_BUILD
3976 const Rebind<TI,
decltype(
d)> di;
3978 AllTrue(di, Lt(vec,
Set(di,
static_cast<TI
>(
N)))));
3982 using V8 =
VFromD<
decltype(d8)>;
3986 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
3987 if (
sizeof(T) == 4) {
3988 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
3989 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3990 const V8 lane_indices =
3992 const V8 byte_indices =
3994 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3995 0, 1, 2, 3, 0, 1, 2, 3};
3996 const V8 sum = Add(byte_indices,
Load(d8, kByteOffsets));
3999 alignas(16)
constexpr uint8_t kBroadcastLaneBytes[16] = {
4000 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
4001 const V8 lane_indices =
4003 const V8 byte_indices =
4005 alignas(16)
constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
4006 0, 1, 2, 3, 4, 5, 6, 7};
4007 const V8 sum = Add(byte_indices,
Load(d8, kByteOffsets));
4012template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
4014 const Rebind<TI,
decltype(
d)> di;
4018template <
typename T,
size_t N>
4029template <
typename T>
4035template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4040template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4046template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4047HWY_API Vec128<T> Reverse(Full128<T> ,
const Vec128<T> v) {
4048 return Shuffle0123(v);
4052template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4060template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4065template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4071template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
4072HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d,
const Vec128<T, N> v) {
4073 const RebindToUnsigned<
decltype(d)> du;
4074 return BitCast(d, Vec128<uint32_t, N>(vrev64_u32(BitCast(du, v).raw)));
4076template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4077HWY_API Vec128<T> Reverse2(Full128<T> d,
const Vec128<T> v) {
4078 const RebindToUnsigned<
decltype(d)> du;
4079 return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
4082template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4089template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
4094template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4100template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4105template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4106HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> ,
const Vec128<T, N>) {
4112template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4117template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4118HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>,
const Vec128<T, N>) {
4130template <
typename T>
4134template <
typename T>
4140template <
typename T>
4146template <
typename T>
4152template <
typename T>
4167HWY_API Vec128<uint64_t> InterleaveLower(
const Vec128<uint64_t> a,
4168 const Vec128<uint64_t> b) {
4169 return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
4171HWY_API Vec128<int64_t> InterleaveLower(
const Vec128<int64_t> a,
4172 const Vec128<int64_t> b) {
4173 return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
4176 const Vec128<double> b) {
4177 return Vec128<double>(vzip1q_f64(a.raw, b.raw));
4196template <
size_t N, HWY_IF_LE64(
float, N)>
4203template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4209template <
typename T,
size_t N,
class V = Vec128<T, N>>
4223HWY_API Vec128<uint64_t> InterleaveUpper(
const Vec128<uint64_t> a,
4224 const Vec128<uint64_t> b) {
4225 return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
4227HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
4228 return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
4231 return Vec128<double>(vzip2q_f64(a.raw, b.raw));
4255template <
typename T,
size_t N, HWY_IF_GE64(T, N),
class V = Vec128<T, N>>
4257 return detail::InterleaveUpper(a, b);
4261template <
typename T,
size_t N, HWY_IF_LE32(T, N),
class V = Vec128<T, N>>
4262HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4263 const Half<
decltype(d)> d2;
4264 return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
4271template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4275template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4280template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
4282 return BitCast(dw, InterleaveUpper(D(), a, b));
4293 const Rebind<uint32_t,
decltype(df32)> du32;
4294 using VU32 =
VFromD<
decltype(du32)>;
4295 const VU32 odd =
Set(du32, 0xFFFF0000u);
4297 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
4299 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
4391HWY_API Vec128<double> Combine(Full128<double> , Vec64<double> hi,
4393 return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
4398template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4414 return Add(sum0, sum1);
4424 const Half<
decltype(
d)> d64;
4447template <
typename T,
size_t N>
4455template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4470#define HWY_NEON_BUILD_TPL_HWY_TRN
4471#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
4474#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
4475 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
4476#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
4498template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4506 using VU =
VFromD<
decltype(du)>;
4508 d, VU(detail::InterleaveEvenOdd(
BitCast(du, lo).raw,
BitCast(du, hi).raw)
4516template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4525template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4526HWY_API Vec128<T, N> ConcatUpperUpper(
const Simd<T, N, 0> d, Vec128<T, N> hi,
4531 return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
4533 using VU = VFromD<
decltype(du)>;
4535 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
4543template <
typename T,
size_t N, HWY_IF_GE64(T, N)>
4550template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
4551HWY_API Vec128<T, N> ConcatLowerUpper(
const Simd<T, N, 0> d, Vec128<T, N> hi,
4553 constexpr size_t kSize = N *
sizeof(T);
4554 const Repartition<uint8_t,
decltype(d)> d8;
4555 const Full64<uint8_t> d8x8;
4556 const Full64<T> d64;
4557 using V8x8 = VFromD<
decltype(d8x8)>;
4558 const V8x8 hi8x8(BitCast(d8, hi).raw);
4560 const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
4561 const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
4563 return Vec128<T, N>(BitCast(d64, r).raw);
4569template <
typename T,
size_t N>
4584template <typename T,
size_t N,
4585 hwy::
EnableIf<N != 2 && sizeof(T) * N >= 8>* =
nullptr>
4588 return detail::ConcatOdd(lo, hi);
4592template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4595 const Twice<
decltype(
d)> d2;
4606template <
typename T>
4609 return InterleaveUpper(
d, lo, hi);
4615template <
typename T,
size_t N,
4619 return detail::ConcatEven(lo, hi);
4623template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4626 const Twice<
decltype(
d)> d2;
4637template <
typename T>
4645template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4648 return detail::InterleaveEven(
v,
v);
4650 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[0]);
4654template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4655HWY_API Vec128<T, N> DupEven(
const Vec128<T, N> v) {
4656 return InterleaveLower(Simd<T, N, 0>(), v, v);
4661template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4664 return detail::InterleaveOdd(
v,
v);
4666 return Vec128<T, N>(detail::InterleaveEvenOdd(
v.raw,
v.raw).val[1]);
4670template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4671HWY_API Vec128<T, N> DupOdd(
const Vec128<T, N> v) {
4672 return InterleaveUpper(Simd<T, N, 0>(), v, v);
4677template <
typename T,
size_t N>
4681 alignas(16)
constexpr uint8_t kBytes[16] = {
4682 ((0 /
sizeof(T)) & 1) ? 0 : 0xFF, ((1 /
sizeof(T)) & 1) ? 0 : 0xFF,
4683 ((2 /
sizeof(T)) & 1) ? 0 : 0xFF, ((3 /
sizeof(T)) & 1) ? 0 : 0xFF,
4684 ((4 /
sizeof(T)) & 1) ? 0 : 0xFF, ((5 /
sizeof(T)) & 1) ? 0 : 0xFF,
4685 ((6 /
sizeof(T)) & 1) ? 0 : 0xFF, ((7 /
sizeof(T)) & 1) ? 0 : 0xFF,
4686 ((8 /
sizeof(T)) & 1) ? 0 : 0xFF, ((9 /
sizeof(T)) & 1) ? 0 : 0xFF,
4687 ((10 /
sizeof(T)) & 1) ? 0 : 0xFF, ((11 /
sizeof(T)) & 1) ? 0 : 0xFF,
4688 ((12 /
sizeof(T)) & 1) ? 0 : 0xFF, ((13 /
sizeof(T)) & 1) ? 0 : 0xFF,
4689 ((14 /
sizeof(T)) & 1) ? 0 : 0xFF, ((15 /
sizeof(T)) & 1) ? 0 : 0xFF,
4696template <
typename T,
size_t N>
4703template <
typename T,
size_t N>
4711template <
typename T>
4722 const Repartition<uint32_t,
decltype(dbf16)> du32;
4735 return Combine(d16, a16, b16);
4755#if defined(__ARM_FEATURE_AES) || \
4756 (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64)
4759#ifdef HWY_NATIVE_AES
4760#undef HWY_NATIVE_AES
4762#define HWY_NATIVE_AES
4765HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4766 Vec128<uint8_t> round_key) {
4771 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
4775HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
4776 Vec128<uint8_t> round_key) {
4777 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
4781 return Vec128<uint64_t>((uint64x2_t)vmull_p64(
GetLane(a),
GetLane(b)));
4785 return Vec128<uint64_t>(
4786 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
4796 const Rebind<uint16_t,
decltype(df32)> du16;
4817 const auto v2 = detail::ConcatEven(v1, v1);
4818 const auto v3 = detail::ConcatEven(v2, v2);
4819 const auto v4 = detail::ConcatEven(v3, v3);
4827 const auto v2 = detail::ConcatEven(v1, v1);
4828 const auto v3 = detail::ConcatEven(v2, v2);
4836 const auto v2 = detail::ConcatEven(v1, v1);
4840template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
4845 const auto v2 = detail::ConcatEven(v1, v1);
4846 const auto v3 = detail::ConcatEven(v2, v2);
4850template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
4855 const auto v2 = detail::ConcatEven(v1, v1);
4859template <
size_t N, hwy::EnableIf<N >= 2>* =
nullptr>
4864 const auto v2 = detail::ConcatEven(v1, v1);
4877 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
4884 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
4893 return Vec128<int64_t, (
N + 1) / 2>(
4894 vget_low_s64(vmull_s32(a_packed, b_packed)));
4902 return Vec128<uint64_t, (
N + 1) / 2>(
4903 vget_low_u64(vmull_u32(a_packed, b_packed)));
4908 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 0), vgetq_lane_u64(b.
raw, 0), &hi);
4914 uint64_t lo =
Mul128(vgetq_lane_u64(a.
raw, 1), vgetq_lane_u64(b.
raw, 1), &hi);
4921template <
typename T,
typename TI>
4930 uint8x16_t table0 =
BitCast(d8, bytes).raw;
4932 table.val[0] = vget_low_u8(table0);
4933 table.val[1] = vget_high_u8(table0);
4934 uint8x16_t idx =
BitCast(d8, from).raw;
4935 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
4936 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
4942template <
typename T,
typename TI,
size_t NI, HWY_IF_LE64(TI, NI)>
4947 const auto idx_full =
Combine(d_full, from64, from64);
4953template <
typename T,
size_t N,
typename TI, HWY_IF_LE64(T, N)>
4961template <
typename T,
size_t N,
typename TI,
size_t NI,
HWY_IF_LE64(T, N),
4967 const Repartition<uint8_t,
decltype(d_idx)> d_idx8;
4970 const auto from8 =
BitCast(d_idx8, from);
4971 const VFromD<
decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
4976template <
class V,
class VI>
4983template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
4987 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
4989 alignas(16) T lanes[
N];
4992 alignas(16) Offset offset_lanes[
N];
4993 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
4995 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
4996 for (
size_t i = 0; i <
N; ++i) {
4997 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
5001template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
5004 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
5006 alignas(16) T lanes[
N];
5009 alignas(16) Index index_lanes[
N];
5010 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
5012 for (
size_t i = 0; i <
N; ++i) {
5013 base[index_lanes[i]] = lanes[i];
5019template <
typename T,
size_t N,
typename Offset>
5023 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
5025 alignas(16) Offset offset_lanes[
N];
5026 Store(offset,
Rebind<Offset,
decltype(
d)>(), offset_lanes);
5028 alignas(16) T lanes[
N];
5029 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
5030 for (
size_t i = 0; i <
N; ++i) {
5031 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
5033 return Load(
d, lanes);
5036template <
typename T,
size_t N,
typename Index>
5040 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
5042 alignas(16) Index index_lanes[
N];
5043 Store(index,
Rebind<Index,
decltype(
d)>(), index_lanes);
5045 alignas(16) T lanes[
N];
5046 for (
size_t i = 0; i <
N; ++i) {
5047 lanes[i] = base[index_lanes[i]];
5049 return Load(
d, lanes);
5057template <
typename T>
5062template <
typename T>
5067template <
typename T>
5075#define HWY_NEON_BUILD_RET_REDUCTION(type, size) Vec128<type##_t, size>
5076#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix, dup) \
5077 HWY_API HWY_NEON_BUILD_RET_REDUCTION(type, size) \
5078 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5079 return HWY_NEON_BUILD_RET_REDUCTION( \
5080 type, size)(dup##suffix(HWY_NEON_EVAL(prefix##infix##suffix, v.raw))); \
5083#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
5084 HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8, vdup_n_) \
5085 HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8, vdupq_n_) \
5086 HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16, vdup_n_) \
5087 HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16, vdupq_n_) \
5088 HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32, vdup_n_) \
5089 HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32, vdupq_n_) \
5090 HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8, vdup_n_) \
5091 HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8, vdupq_n_) \
5092 HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16, vdup_n_) \
5093 HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16, vdupq_n_) \
5094 HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32, vdup_n_) \
5095 HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32, vdupq_n_) \
5096 HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32, vdup_n_) \
5097 HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32, vdupq_n_) \
5098 HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64, vdupq_n_)
5100HWY_NEON_DEF_REDUCTION_CORE_TYPES(MinOfLanes, vminv)
5101HWY_NEON_DEF_REDUCTION_CORE_TYPES(MaxOfLanes, vmaxv)
5104#define HWY_NEON_DEF_REDUCTION_ALL_TYPES(name, prefix) \
5105 HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \
5106 HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64, vdupq_n_) \
5107 HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64, vdupq_n_)
5109HWY_NEON_DEF_REDUCTION_ALL_TYPES(SumOfLanes, vaddv)
5111#undef HWY_NEON_DEF_REDUCTION_ALL_TYPES
5112#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
5113#undef HWY_NEON_DEF_REDUCTION
5114#undef HWY_NEON_BUILD_RET_REDUCTION
5117#define HWY_IF_SUM_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 1 << 2)
5118#define HWY_IF_MINMAX_REDUCTION(T) \
5119 HWY_IF_LANE_SIZE_ONE_OF(T, (1 << 8) | (1 << 2))
5123template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
5128template <
typename T>
5133template <
typename T>
5142 uint32x4x2_t v0 = vuzpq_u32(
v.raw,
v.raw);
5143 uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
5144 uint32x4x2_t v1 = vuzpq_u32(c0, c0);
5149 int32x4x2_t v0 = vuzpq_s32(
v.raw,
v.raw);
5150 int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
5151 int32x4x2_t v1 = vuzpq_s32(c0, c0);
5156 float32x4x2_t v0 = vuzpq_f32(
v.raw,
v.raw);
5157 float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
5158 float32x4x2_t v1 = vuzpq_f32(c0, c0);
5170template <
typename T>
5176 return Min(v20_31_20_31, v31_20_31_20);
5178template <
typename T>
5184 return Max(v20_31_20_31, v31_20_31_20);
5187#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t
5188#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128<type##_t, size>
5189#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
5190 HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) \
5191 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5192 HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
5193 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5194 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5195 return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
5196 type, size)(HWY_NEON_EVAL(vdup##_lane_##suffix, tmp, 0)); \
5198#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
5200 HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) \
5201 name(hwy::SizeTag<sizeof(type##_t)>, const Vec128<type##_t, size> v) { \
5202 HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
5203 tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
5204 vget_low_##suffix(v.raw)); \
5205 if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5206 if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5207 if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
5208 tmp = vdup_lane_##suffix(tmp, 0); \
5209 return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \
5210 type, size)(HWY_NEON_EVAL(vcombine_##suffix, tmp, tmp)); \
5213#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
5214 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
5215 HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
5216 HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
5217 HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
5218 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
5219 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
5220 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
5221 HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8)
5227#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
5228#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
5229#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
5230#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION
5231#undef HWY_NEON_BUILD_TYPE_T
5233template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
5239 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5244template <
size_t N, HWY_IF_GE32(
int16_t, N)>
5250 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
5251 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5257template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
5263 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5268template <
size_t N, HWY_IF_GE32(
int16_t, N)>
5274 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
5275 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5281template <
size_t N, HWY_IF_GE32(u
int16_t, N)>
5287 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5292template <
size_t N, HWY_IF_GE32(
int16_t, N)>
5298 const auto even = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32,
v)));
5299 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
5306#define HWY_IF_SUM_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 0)
5307#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_LANE_SIZE_ONE_OF(T, 1 << 8)
5312template <
typename T, HWY_IF_SUM_REDUCTION(T)>
5317template <
typename T, HWY_IF_MINMAX_REDUCTION(T)>
5322template <
typename T, HWY_IF_MINMAX_REDUCTION(T)>
5328#undef HWY_IF_SUM_REDUCTION
5329#undef HWY_IF_MINMAX_REDUCTION
5333template <
typename T,
size_t N>
5337template <
typename T,
size_t N>
5341template <
typename T,
size_t N>
5353template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5358template <
typename T>
5363template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
5368 const auto vmask_bits =
Set64(du, mask_bits);
5371 alignas(16)
constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
5372 1, 1, 1, 1, 1, 1, 1, 1};
5375 alignas(16)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
5376 1, 2, 4, 8, 16, 32, 64, 128};
5380template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
5383 alignas(16)
constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
5384 const auto vmask_bits = Set(du,
static_cast<uint16_t
>(mask_bits));
5385 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5388template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
5389HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5390 const RebindToUnsigned<
decltype(d)> du;
5391 alignas(16)
constexpr uint32_t kBit[8] = {1, 2, 4, 8};
5392 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
5396template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
5399 alignas(16)
constexpr uint64_t kBit[8] = {1, 2};
5406template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
5409 uint64_t mask_bits = 0;
5411 return detail::LoadMaskBits(
d, mask_bits);
5420template <
typename T>
5428template <
typename T>
5431 const Twice<
decltype(
d)> d2;
5437template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
5442 constexpr size_t kBytes =
sizeof(T) *
N;
5443 return nib & ((1ull << (kBytes * 4)) - 1);
5446template <
typename T>
5449 alignas(16)
constexpr uint8_t kSliceLanes[16] = {
5450 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
5458 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.
raw, values.
raw));
5459 const uint8x8_t x4 = vpadd_u8(x2, x2);
5460 const uint8x8_t x8 = vpadd_u8(x4, x4);
5461 return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
5464 const uint16x8_t x2 = vpaddlq_u8(values.
raw);
5465 const uint32x4_t x4 = vpaddlq_u16(x2);
5466 const uint64x2_t x8 = vpaddlq_u32(x4);
5467 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
5471template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5476 alignas(8)
constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8,
5477 0x10, 0x20, 0x40, 0x80};
5484 return vaddv_u8(values.
raw);
5486 const uint16x4_t x2 = vpaddl_u8(values.
raw);
5487 const uint32x2_t x4 = vpaddl_u16(x2);
5488 const uint64x1_t x8 = vpaddl_u32(x4);
5489 return vget_lane_u64(x8, 0);
5493template <
typename T>
5496 alignas(16)
constexpr uint16_t kSliceLanes[8] = {1, 2, 4, 8,
5497 0x10, 0x20, 0x40, 0x80};
5503 return vaddvq_u16(values.
raw);
5505 const uint32x4_t x2 = vpaddlq_u16(values.
raw);
5506 const uint64x2_t x4 = vpaddlq_u32(x2);
5507 return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
5511template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5516 alignas(8)
constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
5522 return vaddv_u16(values.
raw);
5524 const uint32x2_t x2 = vpaddl_u16(values.
raw);
5525 const uint64x1_t x4 = vpaddl_u32(x2);
5526 return vget_lane_u64(x4, 0);
5530template <
typename T>
5533 alignas(16)
constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
5539 return vaddvq_u32(values.
raw);
5541 const uint64x2_t x2 = vpaddlq_u32(values.
raw);
5542 return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
5546template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5551 alignas(8)
constexpr uint32_t kSliceLanes[2] = {1, 2};
5557 return vaddv_u32(values.
raw);
5559 const uint64x1_t x2 = vpaddl_u32(values.
raw);
5560 return vget_lane_u64(x2, 0);
5564template <
typename T>
5566 alignas(16)
constexpr uint64_t kSliceLanes[2] = {1, 2};
5572 return vaddvq_u64(values.
raw);
5574 return vgetq_lane_u64(values.
raw, 0) + vgetq_lane_u64(values.
raw, 1);
5578template <
typename T>
5584 return vget_lane_u64(values.
raw, 0);
5588template <
typename T,
size_t N>
5590 return ((
N *
sizeof(T)) >= 8) ? bits : (bits & ((1ull <<
N) - 1));
5593template <
typename T,
size_t N>
5608template <
typename T>
5611 const int8x16_t ones =
5615 return static_cast<size_t>(vaddvq_s8(ones));
5617 const int16x8_t x2 = vpaddlq_s8(ones);
5618 const int32x4_t x4 = vpaddlq_s16(x2);
5619 const int64x2_t x8 = vpaddlq_s32(x4);
5620 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
5623template <
typename T>
5626 const int16x8_t ones =
5630 return static_cast<size_t>(vaddvq_s16(ones));
5632 const int32x4_t x2 = vpaddlq_s16(ones);
5633 const int64x2_t x4 = vpaddlq_s32(x2);
5634 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
5638template <
typename T>
5641 const int32x4_t ones =
5645 return static_cast<size_t>(vaddvq_s32(ones));
5647 const int64x2_t x2 = vpaddlq_s32(ones);
5648 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
5652template <
typename T>
5656 const int64x2_t ones =
5658 return static_cast<size_t>(vaddvq_s64(ones));
5662 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
5663 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
5670template <
typename T>
5672 return detail::CountTrue(
hwy::SizeTag<
sizeof(T)>(), mask);
5676template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5678 constexpr int kDiv = 4 *
sizeof(T);
5679 return PopCount(detail::NibblesFromMask(
d, mask)) / kDiv;
5682template <
typename T,
size_t N>
5685 const uint64_t nib = detail::NibblesFromMask(
d, mask);
5686 constexpr size_t kDiv = 4 *
sizeof(T);
5690template <
typename T,
size_t N>
5693 const uint64_t nib = detail::NibblesFromMask(
d, mask);
5694 if (nib == 0)
return -1;
5695 constexpr int kDiv = 4 *
sizeof(T);
5700template <
typename T,
size_t N>
5703 const uint64_t mask_bits = detail::BitsFromMask(mask);
5704 const size_t kNumBytes = (
N + 7) / 8;
5705 CopyBytes<kNumBytes>(&mask_bits, bits);
5709template <
typename T,
size_t N>
5711 return detail::NibblesFromMask(
d, m) == 0;
5715template <
typename T>
5717 return detail::NibblesFromMask(
d, m) == ~0ull;
5720template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
5722 constexpr size_t kBytes =
sizeof(T) *
N;
5723 return detail::NibblesFromMask(
d, m) == (1ull << (kBytes * 4)) - 1;
5728template <
typename T>
5730 enum { value = (
sizeof(T) != 1) };
5737 const uint8_t* bytes) {
5739 vld1q_dup_u64(
reinterpret_cast<const uint64_t*
>(bytes))));
5743template <
size_t N, HWY_IF_LE64(u
int8_t, N)>
5745 const uint8_t* bytes) {
5746 return Load(
d, bytes);
5749template <
typename T,
size_t N>
5751 const uint64_t mask_bits) {
5765 alignas(16)
constexpr uint8_t table[256 * 8] = {
5767 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5768 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5769 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5770 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5771 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5772 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5773 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5774 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5775 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5776 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5777 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5778 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5779 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5780 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5781 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5782 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5783 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5784 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5785 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5786 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5787 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5788 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5789 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5790 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5791 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5792 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5793 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5794 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5795 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5796 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5797 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5798 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5799 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5800 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5801 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5802 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5803 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5804 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5805 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5806 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5807 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5808 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5809 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5810 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5811 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5812 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5813 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5814 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5815 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5816 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5817 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5818 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5819 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5820 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5821 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5822 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5823 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5824 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5825 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5826 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5827 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5828 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5829 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5830 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5831 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5832 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5833 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5834 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5835 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5836 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5837 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5838 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5839 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5840 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5841 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5842 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5843 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5844 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5845 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5846 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5847 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5848 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5849 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5850 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5851 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5852 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5853 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5854 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5855 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5856 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5857 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5858 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5859 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5860 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5861 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5862 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5863 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5864 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5865 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5866 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5867 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5868 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5869 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5870 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5871 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5872 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5873 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5874 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5875 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5876 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5877 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5878 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5879 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5880 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5881 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5882 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5883 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5884 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5885 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5886 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5887 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5888 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5889 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5890 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5891 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5892 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5893 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5894 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5901template <
typename T,
size_t N>
5903 const uint64_t mask_bits) {
5917 alignas(16)
constexpr uint8_t table[256 * 8] = {
5919 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0,
5920 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2,
5921 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4,
5922 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4,
5923 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6,
5924 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6,
5925 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6,
5926 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6,
5927 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8,
5928 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8,
5929 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8,
5930 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8,
5931 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8,
5932 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8,
5933 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8,
5934 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8,
5935 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10,
5936 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10,
5937 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10,
5938 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10,
5939 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10,
5940 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10,
5941 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10,
5942 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10,
5943 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10,
5944 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10,
5945 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10,
5946 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10,
5947 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10,
5948 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10,
5949 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10,
5950 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10,
5951 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12,
5952 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12,
5953 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12,
5954 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12,
5955 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12,
5956 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12,
5957 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12,
5958 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12,
5959 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12,
5960 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12,
5961 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12,
5962 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12,
5963 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12,
5964 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12,
5965 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12,
5966 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12,
5967 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12,
5968 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12,
5969 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12,
5970 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12,
5971 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12,
5972 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12,
5973 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12,
5974 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12,
5975 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12,
5976 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12,
5977 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12,
5978 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12,
5979 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12,
5980 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12,
5981 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12,
5982 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12,
5983 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14,
5984 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14,
5985 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14,
5986 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14,
5987 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14,
5988 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14,
5989 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14,
5990 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14,
5991 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14,
5992 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14,
5993 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14,
5994 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14,
5995 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14,
5996 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14,
5997 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14,
5998 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14,
5999 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14,
6000 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14,
6001 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14,
6002 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14,
6003 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14,
6004 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14,
6005 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14,
6006 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14,
6007 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14,
6008 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14,
6009 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14,
6010 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14,
6011 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14,
6012 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14,
6013 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14,
6014 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14,
6015 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14,
6016 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14,
6017 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14,
6018 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14,
6019 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14,
6020 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14,
6021 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14,
6022 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14,
6023 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14,
6024 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14,
6025 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14,
6026 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14,
6027 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14,
6028 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14,
6029 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14,
6030 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14,
6031 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14,
6032 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14,
6033 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14,
6034 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14,
6035 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14,
6036 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14,
6037 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14,
6038 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14,
6039 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14,
6040 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14,
6041 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14,
6042 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14,
6043 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14,
6044 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14,
6045 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14,
6046 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6053template <
typename T,
size_t N>
6055 const uint64_t mask_bits) {
6059 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
6061 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6062 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6063 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
6064 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6065 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
6066 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
6067 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
6068 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6069 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6070 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
6071 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
6072 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6073 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6074 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
6075 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
6076 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6079 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6082template <
typename T,
size_t N>
6084 const uint64_t mask_bits) {
6088 alignas(16)
constexpr uint8_t u8_indices[16 * 16] = {
6090 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6091 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6092 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6093 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6094 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6095 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6096 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6097 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6098 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6099 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6100 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6101 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6102 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6103 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6107 return BitCast(
d,
Load(d8, u8_indices + 16 * mask_bits));
6110#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
6112template <
typename T,
size_t N>
6114 const uint64_t mask_bits) {
6118 alignas(16)
constexpr uint8_t u8_indices[64] = {
6120 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6121 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6122 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6123 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6125 const Simd<T, N, 0> d;
6126 const Repartition<uint8_t,
decltype(d)> d8;
6127 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6130template <
typename T,
size_t N>
6132 const uint64_t mask_bits) {
6136 alignas(16)
constexpr uint8_t u8_indices[4 * 16] = {
6138 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6139 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6140 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6141 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6143 const Simd<T, N, 0>
d;
6145 return BitCast(d,
Load(d8, u8_indices + 16 * mask_bits));
6152template <
typename T,
size_t N>
6155 detail::IdxFromBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
6161template <
typename T,
size_t N>
6164 detail::IdxFromNotBits<T, N>(
hwy::SizeTag<
sizeof(T)>(), mask_bits);
6173template <
typename T>
6179template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
6191template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6192HWY_API Vec128<T, N> Compress(Vec128<T, N> v,
const Mask128<T, N> mask) {
6193 return detail::Compress(v, detail::BitsFromMask(mask));
6197template <
typename T>
6203template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6215template <
typename T,
size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6219 if (
N < 16 /
sizeof(T)) {
6220 return detail::Compress(
v, detail::BitsFromMask(
Not(mask)));
6222 return detail::CompressNot(
v, detail::BitsFromMask(mask));
6233template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6236 uint64_t mask_bits = 0;
6237 constexpr size_t kNumBytes = (
N + 7) / 8;
6238 CopyBytes<kNumBytes>(bits, &mask_bits);
6240 mask_bits &= (1ull <<
N) - 1;
6243 return detail::Compress(
v, mask_bits);
6247template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6250 const uint64_t mask_bits = detail::BitsFromMask(mask);
6251 StoreU(detail::Compress(
v, mask_bits),
d, unaligned);
6256template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6261 using TU =
TFromD<
decltype(du)>;
6262 const uint64_t mask_bits = detail::BitsFromMask(m);
6263 const size_t count =
PopCount(mask_bits);
6272template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6276 uint64_t mask_bits = 0;
6277 constexpr size_t kNumBytes = (
N + 7) / 8;
6278 CopyBytes<kNumBytes>(bits, &mask_bits);
6280 mask_bits &= (1ull <<
N) - 1;
6283 StoreU(detail::Compress(
v, mask_bits),
d, unaligned);
6290#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
6291#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
6293#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
6297#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
6298#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
6301#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
6302#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6305#define HWY_IF_LOAD_INT(T, N) \
6306 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6307#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
6308 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6309 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6310 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6311 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6312 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6318#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6319 decltype(Tuple2<type##_t, size>().raw)
6321#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6322 const type##_t *from, Tuple2<type##_t, size>
6324#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6325#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6327#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6328 decltype(Tuple3<type##_t, size>().raw)
6329#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6330 const type##_t *from, Tuple3<type##_t, size>
6332#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6333#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6335#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
6336 decltype(Tuple4<type##_t, size>().raw)
6337#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
6338 const type##_t *from, Tuple4<type##_t, size>
6340#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
6341#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
6343#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
6344#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
6345#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
6348template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6358template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6359HWY_API void LoadInterleaved2(Simd<T, N, 0> ,
6363 alignas(16) T buf[2 * 8 /
sizeof(T)] = {};
6364 CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
6365 auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, N>());
6366 v0 = Vec128<T, N>(raw.val[0]);
6367 v1 = Vec128<T, N>(raw.val[1]);
6372template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6374 Vec128<T>& v0, Vec128<T>& v1) {
6375 const Half<
decltype(
d)> dh;
6376 VFromD<
decltype(dh)> v00, v10, v01, v11;
6386template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6397template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6398HWY_API void LoadInterleaved3(Simd<T, N, 0> ,
6400 Vec128<T, N>& v1, Vec128<T, N>& v2) {
6402 alignas(16) T buf[3 * 8 /
sizeof(T)] = {};
6403 CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
6404 auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, N>());
6405 v0 = Vec128<T, N>(raw.val[0]);
6406 v1 = Vec128<T, N>(raw.val[1]);
6407 v2 = Vec128<T, N>(raw.val[2]);
6412template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6414 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
6415 const Half<
decltype(
d)> dh;
6416 VFromD<
decltype(dh)> v00, v10, v20, v01, v11, v21;
6427template <
typename T,
size_t N, HWY_IF_LOAD_INT(T, N)>
6440template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6441HWY_API void LoadInterleaved4(Simd<T, N, 0> ,
6443 Vec128<T, N>& v1, Vec128<T, N>& v2,
6445 alignas(16) T buf[4 * 8 /
sizeof(T)] = {};
6446 CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
6447 auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, N>());
6448 v0 = Vec128<T, N>(raw.val[0]);
6449 v1 = Vec128<T, N>(raw.val[1]);
6450 v2 = Vec128<T, N>(raw.val[2]);
6451 v3 = Vec128<T, N>(raw.val[3]);
6456template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6458 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
6460 const Half<
decltype(
d)> dh;
6461 VFromD<
decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
6471#undef HWY_IF_LOAD_INT
6476#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
6477#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
6478#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
6481#define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
6482#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
6485#define HWY_IF_STORE_INT(T, N) \
6486 hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
6487#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
6488 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
6489 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
6490 HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \
6491 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
6492 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
6495#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6496 Tuple2<type##_t, size> tup, type##_t *to
6498#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6500#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6501 Tuple3<type##_t, size> tup, type##_t *to
6503#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6505#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
6506 Tuple4<type##_t, size> tup, type##_t *to
6508#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
6510#undef HWY_NEON_DEF_FUNCTION_STORE_INT
6511#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
6512#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
6513#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
6516template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6521 detail::StoreInterleaved2(tup, unaligned);
6525template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6526HWY_API void StoreInterleaved2(
const Vec128<T, N> v0,
const Vec128<T, N> v1,
6529 alignas(16) T buf[2 * 8 /
sizeof(T)];
6530 detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
6531 detail::StoreInterleaved2(tup, buf);
6532 CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
6537template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6540 const Half<
decltype(
d)> dh;
6548template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6553 detail::StoreInterleaved3(tup, unaligned);
6557template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6558HWY_API void StoreInterleaved3(
const Vec128<T, N> v0,
const Vec128<T, N> v1,
6559 const Vec128<T, N> v2, Simd<T, N, 0> ,
6561 alignas(16) T buf[3 * 8 /
sizeof(T)];
6562 detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
6563 detail::StoreInterleaved3(tup, buf);
6564 CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
6569template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6571 const Vec128<T> v2, Full128<T> d,
6573 const Half<
decltype(
d)> dh;
6583template <
typename T,
size_t N, HWY_IF_STORE_INT(T, N)>
6589 detail::StoreInterleaved4(tup, unaligned);
6593template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
6594HWY_API void StoreInterleaved4(
const Vec128<T, N> v0,
const Vec128<T, N> v1,
6595 const Vec128<T, N> v2,
const Vec128<T, N> v3,
6598 alignas(16) T buf[4 * 8 /
sizeof(T)];
6599 detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
6600 detail::StoreInterleaved4(tup, buf);
6601 CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
6606template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
6608 const Vec128<T> v2,
const Vec128<T> v3,
6610 const Half<
decltype(
d)> dh;
6618#undef HWY_IF_STORE_INT
6622template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6625 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
6650template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6659template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6662 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
6667template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6676template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6679 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"T must be u64");
6684template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
6774#undef HWY_NEON_BUILD_ARG_1
6775#undef HWY_NEON_BUILD_ARG_2
6776#undef HWY_NEON_BUILD_ARG_3
6777#undef HWY_NEON_BUILD_PARAM_1
6778#undef HWY_NEON_BUILD_PARAM_2
6779#undef HWY_NEON_BUILD_PARAM_3
6780#undef HWY_NEON_BUILD_RET_1
6781#undef HWY_NEON_BUILD_RET_2
6782#undef HWY_NEON_BUILD_RET_3
6783#undef HWY_NEON_BUILD_TPL_1
6784#undef HWY_NEON_BUILD_TPL_2
6785#undef HWY_NEON_BUILD_TPL_3
6786#undef HWY_NEON_DEF_FUNCTION
6787#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
6788#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
6789#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
6790#undef HWY_NEON_DEF_FUNCTION_FULL_UI
6791#undef HWY_NEON_DEF_FUNCTION_INT_16
6792#undef HWY_NEON_DEF_FUNCTION_INT_32
6793#undef HWY_NEON_DEF_FUNCTION_INT_8
6794#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
6795#undef HWY_NEON_DEF_FUNCTION_INTS
6796#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
6797#undef HWY_NEON_DEF_FUNCTION_TPL
6798#undef HWY_NEON_DEF_FUNCTION_UIF81632
6799#undef HWY_NEON_DEF_FUNCTION_UINT_16
6800#undef HWY_NEON_DEF_FUNCTION_UINT_32
6801#undef HWY_NEON_DEF_FUNCTION_UINT_8
6802#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
6803#undef HWY_NEON_DEF_FUNCTION_UINTS
#define HWY_NEON_BUILD_RET_2(type, size)
Definition arm_neon-inl.h:53
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
Definition arm_neon-inl.h:166
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)
Definition arm_neon-inl.h:189
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)
Definition arm_neon-inl.h:199
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)
Definition arm_neon-inl.h:178
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
Definition arm_neon-inl.h:145
#define HWY_NEON_BUILD_ARG_3
Definition arm_neon-inl.h:68
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
Definition arm_neon-inl.h:140
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args)
Definition arm_neon-inl.h:6487
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)
Definition arm_neon-inl.h:98
#define HWY_NEON_BUILD_ARG_2
Definition arm_neon-inl.h:67
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)
Definition arm_neon-inl.h:194
#define HWY_NEON_BUILD_PARAM_2(type, size)
Definition arm_neon-inl.h:58
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)
Definition arm_neon-inl.h:209
#define HWY_NEON_BUILD_TPL_1
Definition arm_neon-inl.h:46
#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix)
Definition arm_neon-inl.h:5213
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
Definition arm_neon-inl.h:128
#define HWY_NEON_BUILD_TPL_2
Definition arm_neon-inl.h:47
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)
Definition arm_neon-inl.h:172
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)
Definition arm_neon-inl.h:83
#define HWY_NEON_EVAL(func,...)
Definition arm_neon-inl.h:77
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)
Definition arm_neon-inl.h:114
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)
Definition arm_neon-inl.h:121
#define HWY_NEON_BUILD_TPL_3
Definition arm_neon-inl.h:48
#define HWY_NEON_BUILD_RET_3(type, size)
Definition arm_neon-inl.h:54
#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args)
Definition arm_neon-inl.h:203
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
Definition arm_neon-inl.h:184
#define HWY_NEON_BUILD_PARAM_1(type, size)
Definition arm_neon-inl.h:57
#define HWY_NEON_BUILD_RET_1(type, size)
Definition arm_neon-inl.h:52
#define HWY_NEON_BUILD_ARG_1
Definition arm_neon-inl.h:66
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)
Definition arm_neon-inl.h:106
#define HWY_NEON_BUILD_PARAM_3(type, size)
Definition arm_neon-inl.h:60
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
Definition arm_neon-inl.h:134
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)
Definition arm_neon-inl.h:6307
#define HWY_IF_FLOAT(T)
Definition base.h:417
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_IF_LE64(T, N)
Definition base.h:407
#define HWY_API
Definition base.h:129
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_IF_NOT_FLOAT(T)
Definition base.h:418
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_ASSERT(condition)
Definition base.h:192
#define HWY_CONCAT(a, b)
Definition base.h:132
#define HWY_IF_UNSIGNED(T)
Definition base.h:414
Definition arm_neon-inl.h:825
HWY_INLINE Mask128()
Definition arm_neon-inl.h:830
Mask128(const Mask128 &)=default
Mask128 & operator=(const Mask128 &)=default
HWY_INLINE Mask128(const Raw raw)
Definition arm_neon-inl.h:833
Raw raw
Definition arm_neon-inl.h:835
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:827
Definition arm_neon-inl.h:778
HWY_INLINE Vec128()
Definition arm_neon-inl.h:785
T PrivateT
Definition arm_neon-inl.h:782
HWY_INLINE Vec128(const Raw raw)
Definition arm_neon-inl.h:788
Vec128(const Vec128 &)=default
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition arm_neon-inl.h:795
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:779
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition arm_neon-inl.h:801
Vec128 & operator=(const Vec128 &)=default
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition arm_neon-inl.h:810
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition arm_neon-inl.h:807
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition arm_neon-inl.h:792
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition arm_neon-inl.h:804
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition arm_neon-inl.h:798
#define HWY_COMPILER_GCC_ACTUAL
Definition detect_compiler_arch.h:109
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5447
HWY_INLINE Vec128< float > ReciprocalNewtonRaphsonStep(const Vec128< float > recip, const Vec128< float > divisor)
Definition arm_neon-inl.h:1748
HWY_INLINE Vec128< uint8_t > Load8Bytes(Full128< uint8_t >, const uint8_t *bytes)
Definition arm_neon-inl.h:5736
HWY_INLINE Vec128< T, N > Set64(Simd< T, N, 0 >, uint64_t mask_bits)
Definition arm_neon-inl.h:5354
HWY_INLINE Vec128< T, N > IdxFromNotBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5902
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6153
HWY_INLINE uint64_t NibblesFromMask(const Full128< T > d, Mask128< T > mask)
Definition arm_neon-inl.h:5421
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5063
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5058
HWY_INLINE Vec128< T, N > CompressNot(Vec128< T, N > v, const uint64_t mask_bits)
Definition arm_neon-inl.h:6162
HWY_INLINE size_t CountTrue(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5609
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3418
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition arm_neon-inl.h:2080
HWY_INLINE Vec128< T, N > IdxFromBits(hwy::SizeTag< 2 >, const uint64_t mask_bits)
Definition arm_neon-inl.h:5750
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5068
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:5589
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition arm_neon-inl.h:5364
HWY_INLINE Vec128< float > ReciprocalSqrtStep(const Vec128< float > root, const Vec128< float > recip)
Definition arm_neon-inl.h:1899
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API void LoadInterleaved2(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1)
Definition arm_neon-inl.h:6349
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API void StoreInterleaved4(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, const Vec128< T, N > v3, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6584
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4453
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
typename D::Twice Twice
Definition ops/shared-inl.h:231
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API void LoadInterleaved3(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2)
Definition arm_neon-inl.h:6387
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
N
Definition rvv-inl.h:1998
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API void LoadInterleaved4(Simd< T, N, 0 >, const T *HWY_RESTRICT unaligned, Vec128< T, N > &v0, Vec128< T, N > &v1, Vec128< T, N > &v2, Vec128< T, N > &v3)
Definition arm_neon-inl.h:6428
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API void StoreInterleaved2(const Vec128< T, N > v0, const Vec128< T, N > v1, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6517
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
const vfloat64m1_t v
Definition rvv-inl.h:1998
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
HWY_API void StoreInterleaved3(const Vec128< T, N > v0, const Vec128< T, N > v1, const Vec128< T, N > v2, Simd< T, N, 0 >, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6549
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:950
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:607
double float64_t
Definition base.h:303
typename EnableIfT< Condition >::type EnableIf
Definition base.h:383
float float32_t
Definition base.h:302
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:593
typename detail::Relations< T >::Signed MakeSigned
Definition base.h:595
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
Definition arm_neon-inl.h:5729
Definition arm_neon-inl.h:3968
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:3969
Definition ops/shared-inl.h:52
uint16x4_t type
Definition arm_neon-inl.h:706
uint16x8_t type
Definition arm_neon-inl.h:643
uint16x4_t type
Definition arm_neon-inl.h:701
uint16x8_t type
Definition arm_neon-inl.h:638
float32x2_t type
Definition arm_neon-inl.h:711
float32x4_t type
Definition arm_neon-inl.h:648
int16x4_t type
Definition arm_neon-inl.h:686
int16x8_t type
Definition arm_neon-inl.h:623
int32x2_t type
Definition arm_neon-inl.h:691
int32x4_t type
Definition arm_neon-inl.h:628
int64x1_t type
Definition arm_neon-inl.h:696
int64x2_t type
Definition arm_neon-inl.h:633
int8x16_t type
Definition arm_neon-inl.h:618
int8x8_t type
Definition arm_neon-inl.h:681
uint16x4_t type
Definition arm_neon-inl.h:666
uint16x8_t type
Definition arm_neon-inl.h:603
uint32x2_t type
Definition arm_neon-inl.h:671
uint32x4_t type
Definition arm_neon-inl.h:608
uint64x1_t type
Definition arm_neon-inl.h:676
uint64x2_t type
Definition arm_neon-inl.h:613
uint8x16_t type
Definition arm_neon-inl.h:598
uint8x8_t type
Definition arm_neon-inl.h:661
Definition x86_128-inl.h:55
__v128_u type
Definition wasm_128-inl.h:61
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:3639
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition arm_neon-inl.h:3646
Definition arm_neon-inl.h:3617
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:3627
HWY_INLINE Vec128< T > operator()(const Vec128< T > v)
Definition arm_neon-inl.h:3620
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N > v)
Definition arm_neon-inl.h:3669
HWY_INLINE Vec128< T, N > operator()(const Vec128< T, N >)
Definition arm_neon-inl.h:3676
Definition arm_neon-inl.h:3652
HWY_INLINE Vec128< T, N > operator()(Vec128< T, N > v)
Definition arm_neon-inl.h:3654
uint16x8x2_t raw
Definition arm_neon-inl.h:364
uint16x4x2_t raw
Definition arm_neon-inl.h:368
uint16x8x2_t raw
Definition arm_neon-inl.h:356
uint16x4x2_t raw
Definition arm_neon-inl.h:360
float32x4x2_t raw
Definition arm_neon-inl.h:373
float32x2x2_t raw
Definition arm_neon-inl.h:377
int16x8x2_t raw
Definition arm_neon-inl.h:315
int16x4x2_t raw
Definition arm_neon-inl.h:319
int32x4x2_t raw
Definition arm_neon-inl.h:331
int32x2x2_t raw
Definition arm_neon-inl.h:335
int64x2x2_t raw
Definition arm_neon-inl.h:347
int64x1x2_t raw
Definition arm_neon-inl.h:351
int8x16x2_t raw
Definition arm_neon-inl.h:299
int8x8x2_t raw
Definition arm_neon-inl.h:303
uint16x8x2_t raw
Definition arm_neon-inl.h:307
uint16x4x2_t raw
Definition arm_neon-inl.h:311
uint32x4x2_t raw
Definition arm_neon-inl.h:323
uint32x2x2_t raw
Definition arm_neon-inl.h:327
uint64x2x2_t raw
Definition arm_neon-inl.h:339
uint64x1x2_t raw
Definition arm_neon-inl.h:343
uint8x16x2_t raw
Definition arm_neon-inl.h:291
uint8x8x2_t raw
Definition arm_neon-inl.h:295
Definition arm_neon-inl.h:283
uint16x8x3_t raw
Definition arm_neon-inl.h:465
uint16x4x3_t raw
Definition arm_neon-inl.h:469
uint16x8x3_t raw
Definition arm_neon-inl.h:457
uint16x4x3_t raw
Definition arm_neon-inl.h:461
float32x4x3_t raw
Definition arm_neon-inl.h:474
float32x2x3_t raw
Definition arm_neon-inl.h:478
int16x8x3_t raw
Definition arm_neon-inl.h:416
int16x4x3_t raw
Definition arm_neon-inl.h:420
int32x4x3_t raw
Definition arm_neon-inl.h:432
int32x2x3_t raw
Definition arm_neon-inl.h:436
int64x2x3_t raw
Definition arm_neon-inl.h:448
int64x1x3_t raw
Definition arm_neon-inl.h:452
int8x16x3_t raw
Definition arm_neon-inl.h:400
int8x8x3_t raw
Definition arm_neon-inl.h:404
uint16x8x3_t raw
Definition arm_neon-inl.h:408
uint16x4x3_t raw
Definition arm_neon-inl.h:412
uint32x4x3_t raw
Definition arm_neon-inl.h:424
uint32x2x3_t raw
Definition arm_neon-inl.h:428
uint64x2x3_t raw
Definition arm_neon-inl.h:440
uint64x1x3_t raw
Definition arm_neon-inl.h:444
uint8x16x3_t raw
Definition arm_neon-inl.h:392
uint8x8x3_t raw
Definition arm_neon-inl.h:396
Definition arm_neon-inl.h:285
uint16x8x4_t raw
Definition arm_neon-inl.h:566
uint16x4x4_t raw
Definition arm_neon-inl.h:570
uint16x8x4_t raw
Definition arm_neon-inl.h:558
uint16x4x4_t raw
Definition arm_neon-inl.h:562
float32x4x4_t raw
Definition arm_neon-inl.h:575
float32x2x4_t raw
Definition arm_neon-inl.h:579
int16x8x4_t raw
Definition arm_neon-inl.h:517
int16x4x4_t raw
Definition arm_neon-inl.h:521
int32x4x4_t raw
Definition arm_neon-inl.h:533
int32x2x4_t raw
Definition arm_neon-inl.h:537
int64x2x4_t raw
Definition arm_neon-inl.h:549
int64x1x4_t raw
Definition arm_neon-inl.h:553
int8x16x4_t raw
Definition arm_neon-inl.h:501
int8x8x4_t raw
Definition arm_neon-inl.h:505
uint16x8x4_t raw
Definition arm_neon-inl.h:509
uint16x4x4_t raw
Definition arm_neon-inl.h:513
uint32x4x4_t raw
Definition arm_neon-inl.h:525
uint32x2x4_t raw
Definition arm_neon-inl.h:529
uint64x2x4_t raw
Definition arm_neon-inl.h:541
uint64x1x4_t raw
Definition arm_neon-inl.h:545
uint8x16x4_t raw
Definition arm_neon-inl.h:493
uint8x8x4_t raw
Definition arm_neon-inl.h:497
Definition arm_neon-inl.h:287