x86_128-inl.h
1// Copyright 2019 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
17// operations when compiling for those targets.
18// External include guard in highway.h - see comment there.
19
20// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
21#include "hwy/base.h"
22
23// Avoid uninitialized warnings in GCC's emmintrin.h - see
24// https://github.com/google/highway/issues/710 and pull/902
25HWY_DIAGNOSTICS(push)
26#if HWY_COMPILER_GCC_ACTUAL
27HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
28HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
29#endif
30
31#include <emmintrin.h>
32#include <stdio.h>
33#if HWY_TARGET == HWY_SSSE3
34#include <tmmintrin.h> // SSSE3
35#else
36#include <smmintrin.h> // SSE4
37#include <wmmintrin.h> // CLMUL
38#endif
39#include <stddef.h>
40#include <stdint.h>
41#include <string.h> // memcpy
42
43#include "hwy/ops/shared-inl.h"
44
45#if HWY_IS_MSAN
46#include <sanitizer/msan_interface.h>
47#endif
48
49HWY_BEFORE_NAMESPACE();
50namespace hwy {
51namespace HWY_NAMESPACE {
52namespace detail {
53
54template <typename T>
55struct Raw128 {
56 using type = __m128i;
57};
58template <>
59struct Raw128<float> {
60 using type = __m128;
61};
62template <>
63struct Raw128<double> {
64 using type = __m128d;
65};
66
67} // namespace detail
68
69template <typename T, size_t N = 16 / sizeof(T)>
70class Vec128 {
71 using Raw = typename detail::Raw128<T>::type;
72
73 public:
74 using PrivateT = T; // only for DFromV
75 static constexpr size_t kPrivateN = N; // only for DFromV
76
77 // Compound assignment. Only usable if there is a corresponding non-member
78 // binary operator overload. For example, only f32 and f64 support division.
79 HWY_INLINE Vec128& operator*=(const Vec128 other) {
80 return *this = (*this * other);
81 }
82 HWY_INLINE Vec128& operator/=(const Vec128 other) {
83 return *this = (*this / other);
84 }
85 HWY_INLINE Vec128& operator+=(const Vec128 other) {
86 return *this = (*this + other);
87 }
88 HWY_INLINE Vec128& operator-=(const Vec128 other) {
89 return *this = (*this - other);
90 }
91 HWY_INLINE Vec128& operator&=(const Vec128 other) {
92 return *this = (*this & other);
93 }
94 HWY_INLINE Vec128& operator|=(const Vec128 other) {
95 return *this = (*this | other);
96 }
97 HWY_INLINE Vec128& operator^=(const Vec128 other) {
98 return *this = (*this ^ other);
99 }
100
101 Raw raw;
102};
103
104template <typename T>
105using Vec64 = Vec128<T, 8 / sizeof(T)>;
106
107template <typename T>
108using Vec32 = Vec128<T, 4 / sizeof(T)>;
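// Example (added for illustration; not part of the original header): the
// compound assignments above simply forward to the non-member operators
// defined later in this file, so they apply to any lane type for which the
// corresponding operator exists. The function name is hypothetical.
template <typename T, size_t N>
HWY_API Vec128<T, N> ExampleAccumulate(Vec128<T, N> sum, Vec128<T, N> x) {
  sum += x;  // same as sum = sum + x
  return sum;
}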
109
110#if HWY_TARGET <= HWY_AVX3
111
112namespace detail {
113
114// Template arg: sizeof(lane type)
115template <size_t size>
116struct RawMask128 {};
117template <>
118struct RawMask128<1> {
119 using type = __mmask16;
120};
121template <>
122struct RawMask128<2> {
123 using type = __mmask8;
124};
125template <>
126struct RawMask128<4> {
127 using type = __mmask8;
128};
129template <>
130struct RawMask128<8> {
131 using type = __mmask8;
132};
133
134} // namespace detail
135
136template <typename T, size_t N = 16 / sizeof(T)>
137struct Mask128 {
138 using Raw = typename detail::RawMask128<sizeof(T)>::type;
139
140 static Mask128<T, N> FromBits(uint64_t mask_bits) {
141 return Mask128<T, N>{static_cast<Raw>(mask_bits)};
142 }
143
144 Raw raw;
145};
146
147#else // AVX2 or below
148
149// FF..FF or 0.
150template <typename T, size_t N = 16 / sizeof(T)>
151struct Mask128 {
152 typename detail::Raw128<T>::type raw;
153};
154
155#endif // HWY_TARGET <= HWY_AVX3
156
157template <class V>
158using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
159
160template <class V>
161using TFromV = typename V::PrivateT;
162
163// ------------------------------ BitCast
164
165namespace detail {
166
167HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
168HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
169HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
170
171template <typename T, size_t N>
172HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
173 return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
174}
175
176// Cannot rely on function overloading because return types differ.
177template <typename T>
178struct BitCastFromInteger128 {
179 HWY_INLINE __m128i operator()(__m128i v) { return v; }
180};
181template <>
182struct BitCastFromInteger128<float> {
183 HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
184};
185template <>
186struct BitCastFromInteger128<double> {
187 HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
188};
189
190template <typename T, size_t N>
191HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
192 Vec128<uint8_t, N * sizeof(T)> v) {
193 return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
194}
195
196} // namespace detail
197
198template <typename T, size_t N, typename FromT>
199HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
200 Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
201 return detail::BitCastFromByte(d, detail::BitCastToByte(v));
202}
203
204// ------------------------------ Zero
205
206// Returns an all-zero vector/part.
207template <typename T, size_t N, HWY_IF_LE128(T, N)>
208HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
209 return Vec128<T, N>{_mm_setzero_si128()};
210}
211template <size_t N, HWY_IF_LE128(float, N)>
212HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
213 return Vec128<float, N>{_mm_setzero_ps()};
214}
215template <size_t N, HWY_IF_LE128(double, N)>
216HWY_API Vec128<double, N> Zero(Simd<double, N, 0> /* tag */) {
217 return Vec128<double, N>{_mm_setzero_pd()};
218}
219
220template <class D>
221using VFromD = decltype(Zero(D()));
222
223// ------------------------------ Set
224
225// Returns a vector/part with all lanes set to "t".
226template <size_t N, HWY_IF_LE128(uint8_t, N)>
227HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
228 return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
229}
230template <size_t N, HWY_IF_LE128(uint16_t, N)>
231HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
232 const uint16_t t) {
233 return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
234}
235template <size_t N, HWY_IF_LE128(uint32_t, N)>
236HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
237 const uint32_t t) {
238 return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
239}
240template <size_t N, HWY_IF_LE128(uint64_t, N)>
241HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
242 const uint64_t t) {
243 return Vec128<uint64_t, N>{
244 _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
245}
246template <size_t N, HWY_IF_LE128(int8_t, N)>
247HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
248 return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
249}
250template <size_t N, HWY_IF_LE128(int16_t, N)>
251HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
252 return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
253}
254template <size_t N, HWY_IF_LE128(int32_t, N)>
255HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
256 return Vec128<int32_t, N>{_mm_set1_epi32(t)};
257}
258template <size_t N, HWY_IF_LE128(int64_t, N)>
259HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
260 return Vec128<int64_t, N>{
261 _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
262}
263template <size_t N, HWY_IF_LE128(float, N)>
264HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
265 return Vec128<float, N>{_mm_set1_ps(t)};
266}
267template <size_t N, HWY_IF_LE128(double, N)>
268HWY_API Vec128<double, N> Set(Simd<double, N, 0> /* tag */, const double t) {
269 return Vec128<double, N>{_mm_set1_pd(t)};
270}
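// Example (added for illustration; not part of the original header): BitCast
// reinterprets lane bits without any conversion, e.g. viewing the bit pattern
// of float lanes as uint32_t. The function name is hypothetical.
HWY_API Vec128<uint32_t, 4> ExampleFloatBitsToU32() {
  const Simd<float, 4, 0> df;
  const Simd<uint32_t, 4, 0> du;
  return BitCast(du, Set(df, 1.0f));  // every lane becomes 0x3F800000
}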
271
272HWY_DIAGNOSTICS(push)
273HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
274
275// Returns a vector with uninitialized elements.
276template <typename T, size_t N, HWY_IF_LE128(T, N)>
277HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
278 // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
279 // generate an XOR instruction.
280 return Vec128<T, N>{_mm_undefined_si128()};
281}
282template <size_t N, HWY_IF_LE128(float, N)>
283HWY_API Vec128<float, N> Undefined(Simd<float, N, 0> /* tag */) {
284 return Vec128<float, N>{_mm_undefined_ps()};
285}
286template <size_t N, HWY_IF_LE128(double, N)>
287HWY_API Vec128<double, N> Undefined(Simd<double, N, 0> /* tag */) {
288 return Vec128<double, N>{_mm_undefined_pd()};
289}
290
291HWY_DIAGNOSTICS(pop)
292
293// ------------------------------ GetLane
294
295// Gets the single value stored in a vector/part.
296template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
297HWY_API T GetLane(const Vec128<T, N> v) {
298 return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
299}
300template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
301HWY_API T GetLane(const Vec128<T, N> v) {
302 return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
303}
304template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
305HWY_API T GetLane(const Vec128<T, N> v) {
306 return static_cast<T>(_mm_cvtsi128_si32(v.raw));
307}
308template <size_t N>
309HWY_API float GetLane(const Vec128<float, N> v) {
310 return _mm_cvtss_f32(v.raw);
311}
312template <size_t N>
313HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
314#if HWY_ARCH_X86_32
315 alignas(16) uint64_t lanes[2];
316 Store(v, Simd<uint64_t, N, 0>(), lanes);
317 return lanes[0];
318#else
319 return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
320#endif
321}
322template <size_t N>
323HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
324#if HWY_ARCH_X86_32
325 alignas(16) int64_t lanes[2];
326 Store(v, Simd<int64_t, N, 0>(), lanes);
327 return lanes[0];
328#else
329 return _mm_cvtsi128_si64(v.raw);
330#endif
331}
332template <size_t N>
333HWY_API double GetLane(const Vec128<double, N> v) {
334 return _mm_cvtsd_f64(v.raw);
335}
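// Example (added for illustration; not part of the original header): Set
// broadcasts a scalar to all lanes, Zero returns an all-zero vector and
// GetLane reads back lane 0. The function name is hypothetical.
HWY_API int32_t ExampleSetGetLane() {
  const Simd<int32_t, 4, 0> d;  // descriptor for four int32_t lanes
  const Vec128<int32_t, 4> v = Set(d, 7);
  return GetLane(v);  // 7
}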
336
337// ================================================== LOGICAL
338
339// ------------------------------ And
340
341template <typename T, size_t N>
342HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
343 return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
344}
345template <size_t N>
346HWY_API Vec128<float, N> And(const Vec128<float, N> a,
347 const Vec128<float, N> b) {
348 return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
349}
350template <size_t N>
351HWY_API Vec128<double, N> And(const Vec128<double, N> a,
352 const Vec128<double, N> b) {
353 return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
354}
355
356// ------------------------------ AndNot
357
358// Returns ~not_mask & mask.
359template <typename T, size_t N>
360HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
361 return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
362}
363template <size_t N>
364HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
365 const Vec128<float, N> mask) {
366 return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
367}
368template <size_t N>
369HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
370 const Vec128<double, N> mask) {
371 return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
372}
373
374// ------------------------------ Or
375
376template <typename T, size_t N>
377HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
378 return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
379}
380
381template <size_t N>
382HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
383 const Vec128<float, N> b) {
384 return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
385}
386template <size_t N>
387HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
388 const Vec128<double, N> b) {
389 return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
390}
391
392// ------------------------------ Xor
393
394template <typename T, size_t N>
395HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
396 return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
397}
398
399template <size_t N>
400HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
401 const Vec128<float, N> b) {
402 return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
403}
404template <size_t N>
405HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
406 const Vec128<double, N> b) {
407 return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
408}
409
410// ------------------------------ Not
411template <typename T, size_t N>
412HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
413 const DFromV<decltype(v)> d;
414 const RebindToUnsigned<decltype(d)> du;
415 using VU = VFromD<decltype(du)>;
416#if HWY_TARGET <= HWY_AVX3
417 const __m128i vu = BitCast(du, v).raw;
418 return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
419#else
420 return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
421#endif
422}
423
424// ------------------------------ Xor3
425template <typename T, size_t N>
426HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
427#if HWY_TARGET <= HWY_AVX3
428 const DFromV<decltype(x1)> d;
429 const RebindToUnsigned<decltype(d)> du;
430 using VU = VFromD<decltype(du)>;
431 const __m128i ret = _mm_ternarylogic_epi64(
432 BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
433 return BitCast(d, VU{ret});
434#else
435 return Xor(x1, Xor(x2, x3));
436#endif
437}
438
439// ------------------------------ Or3
440template <typename T, size_t N>
441HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
442#if HWY_TARGET <= HWY_AVX3
443 const DFromV<decltype(o1)> d;
444 const RebindToUnsigned<decltype(d)> du;
445 using VU = VFromD<decltype(du)>;
446 const __m128i ret = _mm_ternarylogic_epi64(
447 BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
448 return BitCast(d, VU{ret});
449#else
450 return Or(o1, Or(o2, o3));
451#endif
452}
453
454// ------------------------------ OrAnd
455template <typename T, size_t N>
456HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
457#if HWY_TARGET <= HWY_AVX3
458 const DFromV<decltype(o)> d;
459 const RebindToUnsigned<decltype(d)> du;
460 using VU = VFromD<decltype(du)>;
461 const __m128i ret = _mm_ternarylogic_epi64(
462 BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
463 return BitCast(d, VU{ret});
464#else
465 return Or(o, And(a1, a2));
466#endif
467}
468
469// ------------------------------ IfVecThenElse
470template <typename T, size_t N>
471HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
472 Vec128<T, N> no) {
473#if HWY_TARGET <= HWY_AVX3
474 const DFromV<decltype(no)> d;
475 const RebindToUnsigned<decltype(d)> du;
476 using VU = VFromD<decltype(du)>;
477 return BitCast(
478 d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
479 BitCast(du, no).raw, 0xCA)});
480#else
481 return IfThenElse(MaskFromVec(mask), yes, no);
482#endif
483}
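// Note (added for documentation; not in the original header): the last
// argument of _mm_ternarylogic_epi32/64 is an 8-bit truth table: for input
// bits (a, b, c), bit index (a<<2)|(b<<1)|c of the immediate is the output.
// Under that reading, the constants used above are:
//   Not:  all three inputs are v, and 0x55 is the table for ~c, so ~v.
//   Xor3: a ^ b ^ c -> outputs 0,1,1,0,1,0,0,1 for indices 0..7 -> 0x96.
//   Or3:  a | b | c -> only index 0 yields 0 -> 0xFE.
//   OrAnd: a | (b & c) -> 0xF8.
//   IfVecThenElse: (mask & yes) | (~mask & no) -> 0xCA.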
484
485// ------------------------------ Operator overloads (internal-only if float)
486
487template <typename T, size_t N>
488HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
489 return And(a, b);
490}
491
492template <typename T, size_t N>
493HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
494 return Or(a, b);
495}
496
497template <typename T, size_t N>
498HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
499 return Xor(a, b);
500}
501
502// ------------------------------ PopulationCount
503
504// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
505#if HWY_TARGET == HWY_AVX3_DL
506
507#ifdef HWY_NATIVE_POPCNT
508#undef HWY_NATIVE_POPCNT
509#else
510#define HWY_NATIVE_POPCNT
511#endif
512
513namespace detail {
514
515template <typename T, size_t N>
516HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
517 Vec128<T, N> v) {
518 return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
519}
520template <typename T, size_t N>
521HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
522 Vec128<T, N> v) {
523 return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
524}
525template <typename T, size_t N>
526HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
527 Vec128<T, N> v) {
528 return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
529}
530template <typename T, size_t N>
531HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
532 Vec128<T, N> v) {
533 return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
534}
535
536} // namespace detail
537
538template <typename T, size_t N>
539HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
540 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
541}
542
543#endif // HWY_TARGET == HWY_AVX3_DL
544
545// ================================================== SIGN
546
547// ------------------------------ Neg
548
549// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
550namespace detail {
551
552template <typename T, size_t N>
553HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
554 return Xor(v, SignBit(DFromV<decltype(v)>()));
555}
556
557template <typename T, size_t N>
558HWY_INLINE Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, const Vec128<T, N> v) {
559 return Zero(DFromV<decltype(v)>()) - v;
560}
561
562} // namespace detail
563
564template <typename T, size_t N>
565HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
566 return detail::Neg(hwy::IsFloatTag<T>(), v);
567}
568
569// ------------------------------ Abs
570
571// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
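// (Added note, not in the original header: e.g. for int8_t, Abs of -128
// yields -128 again because +128 is not representable in int8_t.)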
572template <size_t N>
573HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
574#if HWY_COMPILER_MSVC
575 // Workaround for incorrect codegen? (reaches breakpoint)
576 const auto zero = Zero(DFromV<decltype(v)>());
577 return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
578#else
579 return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
580#endif
581}
582template <size_t N>
583HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
584 return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
585}
586template <size_t N>
587HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
588 return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
589}
590// i64 is implemented after BroadcastSignBit.
591template <size_t N>
592HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
593 const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
594 return v & BitCast(DFromV<decltype(v)>(), mask);
595}
596template <size_t N>
597HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
598 const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
599 return v & BitCast(DFromV<decltype(v)>(), mask);
600}
601
602// ------------------------------ CopySign
603
604template <typename T, size_t N>
605HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
606 const Vec128<T, N> sign) {
607 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
608
609 const DFromV<decltype(magn)> d;
610 const auto msb = SignBit(d);
611
612#if HWY_TARGET <= HWY_AVX3
613 const RebindToUnsigned<decltype(d)> du;
614 // Truth table for msb, magn, sign | bitwise msb ? sign : magn
615 // 0 0 0 | 0
616 // 0 0 1 | 0
617 // 0 1 0 | 1
618 // 0 1 1 | 1
619 // 1 0 0 | 0
620 // 1 0 1 | 1
621 // 1 1 0 | 0
622 // 1 1 1 | 1
623 // The lane size does not matter because we are not using predication.
624 const __m128i out = _mm_ternarylogic_epi32(
625 BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
626 return BitCast(d, VFromD<decltype(du)>{out});
627#else
628 return Or(AndNot(msb, magn), And(msb, sign));
629#endif
630}
631
632template <typename T, size_t N>
633HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
634 const Vec128<T, N> sign) {
635#if HWY_TARGET <= HWY_AVX3
636 // AVX3 can also handle abs < 0, so no extra action needed.
637 return CopySign(abs, sign);
638#else
639 return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
640#endif
641}
642
643// ================================================== MASK
644
645namespace detail {
646
647template <typename T>
648HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
649 // Workaround for MSAN not marking compressstore as initialized (b/233326619)
650#if HWY_IS_MSAN
651 __msan_unpoison(unaligned, count * sizeof(T));
652#else
653 (void)unaligned;
654 (void)count;
655#endif
656}
657
658} // namespace detail
659
660#if HWY_TARGET <= HWY_AVX3
661
662// ------------------------------ IfThenElse
663
664// Returns mask ? yes : no.
665
666namespace detail {
667
668// Templates for signed/unsigned integer of a particular size.
669template <typename T, size_t N>
670HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
671 Mask128<T, N> mask, Vec128<T, N> yes,
672 Vec128<T, N> no) {
673 return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
674}
675template <typename T, size_t N>
676HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
677 Mask128<T, N> mask, Vec128<T, N> yes,
678 Vec128<T, N> no) {
679 return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
680}
681template <typename T, size_t N>
682HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
683 Mask128<T, N> mask, Vec128<T, N> yes,
684 Vec128<T, N> no) {
685 return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
686}
687template <typename T, size_t N>
688HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
689 Mask128<T, N> mask, Vec128<T, N> yes,
690 Vec128<T, N> no) {
691 return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
692}
693
694} // namespace detail
695
696template <typename T, size_t N>
697HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
698 Vec128<T, N> no) {
699 return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
700}
701
702template <size_t N>
703HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
704 Vec128<float, N> yes, Vec128<float, N> no) {
705 return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
706}
707
708template <size_t N>
709HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
710 Vec128<double, N> yes,
711 Vec128<double, N> no) {
712 return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
713}
714
715namespace detail {
716
717template <typename T, size_t N>
718HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
719 Mask128<T, N> mask, Vec128<T, N> yes) {
720 return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
721}
722template <typename T, size_t N>
723HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
724 Mask128<T, N> mask, Vec128<T, N> yes) {
725 return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
726}
727template <typename T, size_t N>
728HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
729 Mask128<T, N> mask, Vec128<T, N> yes) {
730 return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
731}
732template <typename T, size_t N>
733HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
734 Mask128<T, N> mask, Vec128<T, N> yes) {
735 return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
736}
737
738} // namespace detail
739
740template <typename T, size_t N>
741HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
742 return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
743}
744
745template <size_t N>
746HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
747 Vec128<float, N> yes) {
748 return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
749}
750
751template <size_t N>
752HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
753 Vec128<double, N> yes) {
754 return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
755}
756
757namespace detail {
758
759template <typename T, size_t N>
760HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
761 Mask128<T, N> mask, Vec128<T, N> no) {
762 // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
763 return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
764}
765template <typename T, size_t N>
766HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
767 Mask128<T, N> mask, Vec128<T, N> no) {
768 return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
769}
770template <typename T, size_t N>
771HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
772 Mask128<T, N> mask, Vec128<T, N> no) {
773 return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
774}
775template <typename T, size_t N>
776HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
777 Mask128<T, N> mask, Vec128<T, N> no) {
778 return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
779}
780
781} // namespace detail
782
783template <typename T, size_t N>
784HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
785 return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
786}
787
788template <size_t N>
789HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
790 Vec128<float, N> no) {
791 return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
792}
793
794template <size_t N>
795HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
796 Vec128<double, N> no) {
797 return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
798}
799
800// ------------------------------ Mask logical
801
802// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
803#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
804#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
805 HWY_COMPILER_CLANG >= 800
806#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
807#else
808#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
809#endif
810#endif // HWY_COMPILER_HAS_MASK_INTRINSICS
811
812namespace detail {
813
814template <typename T, size_t N>
815HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
816 const Mask128<T, N> b) {
817#if HWY_COMPILER_HAS_MASK_INTRINSICS
818 return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
819#else
820 return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
821#endif
822}
823template <typename T, size_t N>
824HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
825 const Mask128<T, N> b) {
826#if HWY_COMPILER_HAS_MASK_INTRINSICS
827 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
828#else
829 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
830#endif
831}
832template <typename T, size_t N>
833HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
834 const Mask128<T, N> b) {
835#if HWY_COMPILER_HAS_MASK_INTRINSICS
836 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
837#else
838 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
839#endif
840}
841template <typename T, size_t N>
842HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
843 const Mask128<T, N> b) {
844#if HWY_COMPILER_HAS_MASK_INTRINSICS
845 return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
846#else
847 return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
848#endif
849}
850
851template <typename T, size_t N>
852HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
853 const Mask128<T, N> b) {
854#if HWY_COMPILER_HAS_MASK_INTRINSICS
855 return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
856#else
857 return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
858#endif
859}
860template <typename T, size_t N>
861HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
862 const Mask128<T, N> b) {
863#if HWY_COMPILER_HAS_MASK_INTRINSICS
864 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
865#else
866 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
867#endif
868}
869template <typename T, size_t N>
870HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
871 const Mask128<T, N> b) {
872#if HWY_COMPILER_HAS_MASK_INTRINSICS
873 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
874#else
875 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
876#endif
877}
878template <typename T, size_t N>
879HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
880 const Mask128<T, N> b) {
881#if HWY_COMPILER_HAS_MASK_INTRINSICS
882 return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
883#else
884 return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
885#endif
886}
887
888template <typename T, size_t N>
889HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
890 const Mask128<T, N> b) {
891#if HWY_COMPILER_HAS_MASK_INTRINSICS
892 return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
893#else
894 return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
895#endif
896}
897template <typename T, size_t N>
898HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
899 const Mask128<T, N> b) {
900#if HWY_COMPILER_HAS_MASK_INTRINSICS
901 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
902#else
903 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
904#endif
905}
906template <typename T, size_t N>
907HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
908 const Mask128<T, N> b) {
909#if HWY_COMPILER_HAS_MASK_INTRINSICS
910 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
911#else
912 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
913#endif
914}
915template <typename T, size_t N>
916HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
917 const Mask128<T, N> b) {
918#if HWY_COMPILER_HAS_MASK_INTRINSICS
919 return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
920#else
921 return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
922#endif
923}
924
925template <typename T, size_t N>
926HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
927 const Mask128<T, N> b) {
928#if HWY_COMPILER_HAS_MASK_INTRINSICS
929 return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
930#else
931 return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
932#endif
933}
934template <typename T, size_t N>
935HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
936 const Mask128<T, N> b) {
937#if HWY_COMPILER_HAS_MASK_INTRINSICS
938 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
939#else
940 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
941#endif
942}
943template <typename T, size_t N>
944HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
945 const Mask128<T, N> b) {
946#if HWY_COMPILER_HAS_MASK_INTRINSICS
947 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
948#else
949 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
950#endif
951}
952template <typename T, size_t N>
953HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
954 const Mask128<T, N> b) {
955#if HWY_COMPILER_HAS_MASK_INTRINSICS
956 return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
957#else
958 return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
959#endif
960}
961
962template <typename T, size_t N>
963HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
964 const Mask128<T, N> a,
965 const Mask128<T, N> b) {
966#if HWY_COMPILER_HAS_MASK_INTRINSICS
967 return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
968#else
969 return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
970#endif
971}
972template <typename T, size_t N>
973HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
974 const Mask128<T, N> a,
975 const Mask128<T, N> b) {
976#if HWY_COMPILER_HAS_MASK_INTRINSICS
977 return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
978#else
979 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
980#endif
981}
982template <typename T, size_t N>
983HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
984 const Mask128<T, N> a,
985 const Mask128<T, N> b) {
986#if HWY_COMPILER_HAS_MASK_INTRINSICS
987 return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
988#else
989 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
990#endif
991}
992template <typename T, size_t N>
993HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
994 const Mask128<T, N> a,
995 const Mask128<T, N> b) {
996#if HWY_COMPILER_HAS_MASK_INTRINSICS
997 return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
998#else
999 return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
1000#endif
1001}
1002
1003} // namespace detail
1004
1005template <typename T, size_t N>
1006HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1007 return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
1008}
1009
1010template <typename T, size_t N>
1011HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1012 return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
1013}
1014
1015template <typename T, size_t N>
1016HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1017 return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
1018}
1019
1020template <typename T, size_t N>
1021HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1022 return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
1023}
1024
1025template <typename T, size_t N>
1026HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1027 // Flip only the valid bits.
1028 // TODO(janwas): use _knot intrinsics if N >= 8.
1029 return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
1030}
1031
1032template <typename T, size_t N>
1033HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
1034 return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
1035}
1036
1037#else // AVX2 or below
1038
1039// ------------------------------ Mask
1040
1041// Mask and Vec are the same (true = FF..FF).
1042template <typename T, size_t N>
1043HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1044 return Mask128<T, N>{v.raw};
1045}
1046
1047template <typename T, size_t N>
1048HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1049 return Vec128<T, N>{v.raw};
1050}
1051
1052template <typename T, size_t N>
1053HWY_API Vec128<T, N> VecFromMask(const Simd<T, N, 0> /* tag */,
1054 const Mask128<T, N> v) {
1055 return Vec128<T, N>{v.raw};
1056}
1057
1058#if HWY_TARGET == HWY_SSSE3
1059
1060// mask ? yes : no
1061template <typename T, size_t N>
1062HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1063 Vec128<T, N> no) {
1064 const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
1065 return Or(And(vmask, yes), AndNot(vmask, no));
1066}
1067
1068#else // HWY_TARGET == HWY_SSSE3
1069
1070// mask ? yes : no
1071template <typename T, size_t N>
1072HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1073 Vec128<T, N> no) {
1074 return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
1075}
1076template <size_t N>
1077HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
1078 const Vec128<float, N> yes,
1079 const Vec128<float, N> no) {
1080 return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
1081}
1082template <size_t N>
1083HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
1084 const Vec128<double, N> yes,
1085 const Vec128<double, N> no) {
1086 return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
1087}
1088
1089#endif // HWY_TARGET == HWY_SSSE3
1090
1091// mask ? yes : 0
1092template <typename T, size_t N>
1093HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1094 return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
1095}
1096
1097// mask ? 0 : no
1098template <typename T, size_t N>
1099HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1100 return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
1101}
1102
1103// ------------------------------ Mask logical
1104
1105template <typename T, size_t N>
1106HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1107 return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
1108}
1109
1110template <typename T, size_t N>
1111HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1112 const Simd<T, N, 0> d;
1113 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1114}
1115
1116template <typename T, size_t N>
1117HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1118 const Simd<T, N, 0> d;
1119 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1120}
1121
1122template <typename T, size_t N>
1123HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1124 const Simd<T, N, 0> d;
1125 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1126}
1127
1128template <typename T, size_t N>
1129HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1130 const Simd<T, N, 0> d;
1131 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1132}
1133
1134template <typename T, size_t N>
1135HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
1136 const Simd<T, N, 0> d;
1137 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
1138}
1139
1140#endif // HWY_TARGET <= HWY_AVX3
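// Example (added for illustration; not part of the original header):
// IfThenElseZero/IfThenZeroElse are the masking forms of IfThenElse. On
// AVX-512 targets they map to zero-masked moves, elsewhere to And/AndNot
// with the mask vector. The function name is hypothetical.
template <typename T, size_t N>
HWY_API Vec128<T, N> ExampleZeroUnselected(Mask128<T, N> m, Vec128<T, N> v) {
  return IfThenElseZero(m, v);  // per lane: m ? v : 0
}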
1141
1142// ------------------------------ ShiftLeft
1143
1144template <int kBits, size_t N>
1145HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
1146 return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1147}
1148
1149template <int kBits, size_t N>
1150HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
1151 return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1152}
1153
1154template <int kBits, size_t N>
1155HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
1156 return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1157}
1158
1159template <int kBits, size_t N>
1160HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
1161 return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
1162}
1163template <int kBits, size_t N>
1164HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
1165 return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
1166}
1167template <int kBits, size_t N>
1168HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
1169 return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
1170}
1171
1172template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1173HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
1174 const DFromV<decltype(v)> d8;
1175 // Use raw instead of BitCast to support N=1.
1176 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
1177 return kBits == 1
1178 ? (v + v)
1179 : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
1180}
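// Note (added for documentation; not in the original header): x86 has no
// 8-bit shift instruction, so the overload above shifts 16-bit lanes and then
// clears the bits that crossed into each upper byte. For example, with
// kBits=3 the bytes {0x81, 0x01} form the u16 0x0181; shifting gives 0x0C08,
// and masking each byte with 0xF8 yields {0x08, 0x08}, the same as shifting
// each byte independently.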
1181
1182// ------------------------------ ShiftRight
1183
1184template <int kBits, size_t N>
1185HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
1186 return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
1187}
1188template <int kBits, size_t N>
1189HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
1190 return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
1191}
1192template <int kBits, size_t N>
1193HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
1194 return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
1195}
1196
1197template <int kBits, size_t N>
1198HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
1199 const DFromV<decltype(v)> d8;
1200 // Use raw instead of BitCast to support N=1.
1201 const Vec128<uint8_t, N> shifted{
1202 ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
1203 return shifted & Set(d8, 0xFF >> kBits);
1204}
1205
1206template <int kBits, size_t N>
1207HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
1208 return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
1209}
1210template <int kBits, size_t N>
1211HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
1212 return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
1213}
1214
1215template <int kBits, size_t N>
1216HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
1217 const DFromV<decltype(v)> di;
1218 const RebindToUnsigned<decltype(di)> du;
1219 const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
1220 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
1221 return (shifted ^ shifted_sign) - shifted_sign;
1222}
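// Note (added for documentation; not in the original header): there is no
// 8-bit arithmetic shift, so the overload above does a logical shift and then
// restores the sign: after the shift, the original sign bit sits at
// 0x80 >> kBits; XOR-ing with that value and then subtracting it propagates
// the sign into the upper bits. E.g. for v = -128 (0x80) and kBits=1: the
// logical shift gives 0x40, 0x40 ^ 0x40 = 0, and 0 - 0x40 = 0xC0 = -64.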
1223
1224// i64 is implemented after BroadcastSignBit.
1225
1226// ================================================== SWIZZLE (1)
1227
1228// ------------------------------ TableLookupBytes
1229template <typename T, size_t N, typename TI, size_t NI>
1230HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
1231 const Vec128<TI, NI> from) {
1232 return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
1233}
1234
1235// ------------------------------ TableLookupBytesOr0
1236// For all vector widths; x86 already zeroes the output byte if an index is >= 0x80.
1237template <class V, class VI>
1238HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
1239 return TableLookupBytes(bytes, from);
1240}
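// Example (added for illustration; not part of the original header):
// TableLookupBytes returns bytes[from[i]] for each byte index i, so an
// all-zero index vector broadcasts the first byte. The function name is
// hypothetical.
HWY_API Vec128<uint8_t, 16> ExampleBroadcastByte0(Vec128<uint8_t, 16> bytes) {
  const Simd<uint8_t, 16, 0> d;
  return TableLookupBytes(bytes, Zero(d));
}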
1241
1242// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)
1243
1244// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
1245// Shuffle0321 rotates one lane to the right (the previous least-significant
1246// lane is now most-significant). These could also be implemented via
1247// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
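// (Added example, not in the original header: if the lanes hold {3,2,1,0},
// listed from most- to least-significant, then Shuffle2301 yields {2,3,0,1}
// and Shuffle0321 yields {0,3,2,1}.)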
1248
1249// Swap 32-bit halves in 64-bit halves.
1250template <typename T, size_t N>
1251HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
1252 static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
1253 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1254 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1255}
1256template <size_t N>
1257HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
1258 static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1259 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
1260}
1261
1262// These are used by generic_ops-inl to implement LoadInterleaved3. As with
1263// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
1264// comes from the first argument.
1265namespace detail {
1266
1267template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1268HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1269 const Twice<DFromV<decltype(a)>> d2;
1270 const auto ba = Combine(d2, b, a);
1271 alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
1272 return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1273}
1274template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1275HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1276 const Twice<DFromV<decltype(a)>> d2;
1277 const auto ba = Combine(d2, b, a);
1278 alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
1279 return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1280}
1281template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1282HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1283 const DFromV<decltype(a)> d;
1284 const RebindToFloat<decltype(d)> df;
1285 constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
1286 return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
1287 BitCast(df, b).raw, m)});
1288}
1289
1290template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1291HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1292 const Twice<DFromV<decltype(a)>> d2;
1293 const auto ba = Combine(d2, b, a);
1294 alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
1295 return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1296}
1297template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1298HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1299 const Twice<DFromV<decltype(a)>> d2;
1300 const auto ba = Combine(d2, b, a);
1301 alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
1302 return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1303}
1304template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1305HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1306 const DFromV<decltype(a)> d;
1307 const RebindToFloat<decltype(d)> df;
1308 constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
1309 return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
1310 BitCast(df, b).raw, m)});
1311}
1312
1313template <typename T, HWY_IF_LANE_SIZE(T, 1)>
1314HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1315 const Twice<DFromV<decltype(a)>> d2;
1316 const auto ba = Combine(d2, b, a);
1317 alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
1318 return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1319}
1320template <typename T, HWY_IF_LANE_SIZE(T, 2)>
1321HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1322 const Twice<DFromV<decltype(a)>> d2;
1323 const auto ba = Combine(d2, b, a);
1324 alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
1325 return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
1326}
1327template <typename T, HWY_IF_LANE_SIZE(T, 4)>
1328HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
1329 const DFromV<decltype(a)> d;
1330 const RebindToFloat<decltype(d)> df;
1331 constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
1332 return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
1333 BitCast(df, b).raw, m)});
1334}
1335
1336} // namespace detail
1337
1338// Swap 64-bit halves
1339HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
1340 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1341}
1342HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
1343 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1344}
1345HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
1346 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
1347}
1348HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
1349 return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1350}
1351HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
1352 return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1353}
1354HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
1355 return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
1356}
1357
1358// Rotate right 32 bits
1359HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
1360 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1361}
1362HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
1363 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1364}
1365HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
1366 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
1367}
1368// Rotate left 32 bits
1369HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
1370 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1371}
1372HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
1373 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1374}
1375HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
1376 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
1377}
1378
1379// Reverse
1380HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
1381 return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1382}
1383HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
1384 return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1385}
1386HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
1387 return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
1388}
1389
1390// ================================================== COMPARE
1391
1392#if HWY_TARGET <= HWY_AVX3
1393
1394// Comparisons set a mask bit to 1 if the condition is true, else 0.
1395
1396template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
1397HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo, 0> /*tag*/,
1398 Mask128<TFrom, NFrom> m) {
1399 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1400 return Mask128<TTo, NTo>{m.raw};
1401}
1402
1403namespace detail {
1404
1405template <typename T, size_t N>
1406HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
1407 const Vec128<T, N> bit) {
1408 return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
1409}
1410template <typename T, size_t N>
1411HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
1412 const Vec128<T, N> bit) {
1413 return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
1414}
1415template <typename T, size_t N>
1416HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
1417 const Vec128<T, N> bit) {
1418 return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
1419}
1420template <typename T, size_t N>
1421HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
1422 const Vec128<T, N> bit) {
1423 return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
1424}
1425
1426} // namespace detail
1427
1428template <typename T, size_t N>
1429HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
1430 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1431 return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1432}
1433
1434// ------------------------------ Equality
1435
1436template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1437HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1438 return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
1439}
1440
1441template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1442HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1443 return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1444}
1445
1446template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1447HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1448 return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1449}
1450
1451template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1452HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1453 return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1454}
1455
1456template <size_t N>
1457HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
1458 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1459}
1460
1461template <size_t N>
1462HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
1463 Vec128<double, N> b) {
1464 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1465}
1466
1467// ------------------------------ Inequality
1468
1469template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1470HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1471 return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
1472}
1473
1474template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1475HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1476 return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1477}
1478
1479template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1480HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1481 return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1482}
1483
1484template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1485HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1486 return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1487}
1488
1489template <size_t N>
1490HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
1491 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1492}
1493
1494template <size_t N>
1495HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
1496 Vec128<double, N> b) {
1497 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1498}
1499
1500// ------------------------------ Strict inequality
1501
1502// Signed/float <
1503template <size_t N>
1504HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1505 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1506}
1507template <size_t N>
1508HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1509 Vec128<int16_t, N> b) {
1510 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1511}
1512template <size_t N>
1513HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1514 Vec128<int32_t, N> b) {
1515 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1516}
1517template <size_t N>
1518HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
1519 Vec128<int64_t, N> b) {
1520 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1521}
1522
1523template <size_t N>
1524HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
1525 Vec128<uint8_t, N> b) {
1526 return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
1527}
1528template <size_t N>
1529HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
1530 Vec128<uint16_t, N> b) {
1531 return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
1532}
1533template <size_t N>
1534HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
1535 Vec128<uint32_t, N> b) {
1536 return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
1537}
1538template <size_t N>
1539HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
1540 Vec128<uint64_t, N> b) {
1541 return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
1542}
1543
1544template <size_t N>
1545HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1546 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1547}
1548template <size_t N>
1549HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1550 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1551}
1552
1553// ------------------------------ Weak inequality
1554
1555template <size_t N>
1556HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
1557 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1558}
1559template <size_t N>
1560HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
1561 Vec128<double, N> b) {
1562 return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1563}
1564
1565// ------------------------------ Mask
1566
1567namespace detail {
1568
1569template <typename T, size_t N>
1570HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
1571 const Vec128<T, N> v) {
1572 return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
1573}
1574template <typename T, size_t N>
1575HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
1576 const Vec128<T, N> v) {
1577 return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
1578}
1579template <typename T, size_t N>
1580HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
1581 const Vec128<T, N> v) {
1582 return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
1583}
1584template <typename T, size_t N>
1585HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
1586 const Vec128<T, N> v) {
1587 return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
1588}
1589
1590} // namespace detail
1591
1592template <typename T, size_t N>
1593HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1594 return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1595}
1596// There do not seem to be native floating-point versions of these instructions.
1597template <size_t N>
1598HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
1599 const RebindToSigned<DFromV<decltype(v)>> di;
1600 return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
1601}
1602template <size_t N>
1603HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
1604 const RebindToSigned<DFromV<decltype(v)>> di;
1605 return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
1606}
1607
1608template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1609HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1610 return Vec128<T, N>{_mm_movm_epi8(v.raw)};
1611}
1612
1613template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1614HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1615 return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1616}
1617
1618template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1619HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1620 return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1621}
1622
1623template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1624HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1625 return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1626}
1627
1628template <size_t N>
1629HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
1630 return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
1631}
1632
1633template <size_t N>
1634HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
1635 return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
1636}
1637
1638template <typename T, size_t N>
1639HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
1640 const Mask128<T, N> v) {
1641 return VecFromMask(v);
1642}
1643
1644#else // AVX2 or below
1645
1646// Comparisons fill a lane with 1-bits if the condition is true, else 0.
1647
1648template <typename TFrom, typename TTo, size_t N>
1649HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
1650 Mask128<TFrom, N> m) {
1651 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1652 const Simd<TFrom, N, 0> d;
1653 return MaskFromVec(BitCast(Simd<TTo, N, 0>(), VecFromMask(d, m)));
1654}
1655
1656template <typename T, size_t N>
1657HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1658 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1659 return (v & bit) == bit;
1660}
1661
1662// ------------------------------ Equality
1663
1664// Unsigned
1665template <size_t N>
1666HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
1667 const Vec128<uint8_t, N> b) {
1668 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1669}
1670template <size_t N>
1671HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
1672 const Vec128<uint16_t, N> b) {
1673 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1674}
1675template <size_t N>
1676HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
1677 const Vec128<uint32_t, N> b) {
1678 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1679}
1680template <size_t N>
1681HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1682 const Vec128<uint64_t, N> b) {
1683#if HWY_TARGET == HWY_SSSE3
1684 const Simd<uint32_t, N * 2, 0> d32;
1685 const Simd<uint64_t, N, 0> d64;
1686 const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1687 const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1688 return MaskFromVec(BitCast(d64, cmp64));
1689#else
1690 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1691#endif
1692}
1693
1694// Signed
1695template <size_t N>
1696HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
1697 const Vec128<int8_t, N> b) {
1698 return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1699}
1700template <size_t N>
1701HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
1702 Vec128<int16_t, N> b) {
1703 return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1704}
1705template <size_t N>
1706HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
1707 const Vec128<int32_t, N> b) {
1708 return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1709}
1710template <size_t N>
1711HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1712 const Vec128<int64_t, N> b) {
1713 // Equality is sign-agnostic; reuse the unsigned version to avoid duplicating the SSSE3 workaround.
1714 const DFromV<decltype(a)> d;
1715 RebindToUnsigned<decltype(d)> du;
1716 return RebindMask(d, BitCast(du, a) == BitCast(du, b));
1717}
1718
1719// Float
1720template <size_t N>
1721HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
1722 const Vec128<float, N> b) {
1723 return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1724}
1725template <size_t N>
1726HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
1727 const Vec128<double, N> b) {
1728 return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1729}
1730
1731// ------------------------------ Inequality
1732
1733// This cannot have T as a template argument, otherwise it is not more
1734// specialized than rewritten operator== in C++20, leading to compile
1735// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
1736template <size_t N>
1737HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
1738 Vec128<uint8_t, N> b) {
1739 return Not(a == b);
1740}
1741template <size_t N>
1742HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
1743 Vec128<uint16_t, N> b) {
1744 return Not(a == b);
1745}
1746template <size_t N>
1747HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
1748 Vec128<uint32_t, N> b) {
1749 return Not(a == b);
1750}
1751template <size_t N>
1752HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
1753 Vec128<uint64_t, N> b) {
1754 return Not(a == b);
1755}
1756template <size_t N>
1757HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
1758 Vec128<int8_t, N> b) {
1759 return Not(a == b);
1760}
1761template <size_t N>
1762HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
1763 Vec128<int16_t, N> b) {
1764 return Not(a == b);
1765}
1766template <size_t N>
1767HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
1768 Vec128<int32_t, N> b) {
1769 return Not(a == b);
1770}
1771template <size_t N>
1772HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
1773 Vec128<int64_t, N> b) {
1774 return Not(a == b);
1775}
1776
1777template <size_t N>
1778HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
1779 const Vec128<float, N> b) {
1780 return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1781}
1782template <size_t N>
1783HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
1784 const Vec128<double, N> b) {
1785 return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1786}
1787
1788// ------------------------------ Strict inequality
1789
1790namespace detail {
1791
1792template <size_t N>
1793HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
1794 Vec128<int8_t, N> b) {
1795 return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1796}
1797template <size_t N>
1798HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
1799 Vec128<int16_t, N> b) {
1800 return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1801}
1802template <size_t N>
1803HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
1804 Vec128<int32_t, N> b) {
1805 return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1806}
1807
1808template <size_t N>
1809HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
1810 const Vec128<int64_t, N> a,
1811 const Vec128<int64_t, N> b) {
1812#if HWY_TARGET == HWY_SSSE3
1813 // See https://stackoverflow.com/questions/65166174/:
1814 const Simd<int64_t, N, 0> d;
1815 const RepartitionToNarrow<decltype(d)> d32;
1816 const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
1817 const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
1818 // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
1819 // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
1820 const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
1821 // Duplicate upper to lower half.
1822 return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
1823#else
1824 return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
1825#endif
1826}
1827
1828template <typename T, size_t N>
1829HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
1830 Vec128<T, N> b) {
1831 const DFromV<decltype(a)> du;
1832 const RebindToSigned<decltype(du)> di;
1833 const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1834 const auto sa = BitCast(di, Xor(a, msb));
1835 const auto sb = BitCast(di, Xor(b, msb));
1836 return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
1837}
1838
1839template <size_t N>
1840HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
1841 Vec128<float, N> b) {
1842 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1843}
1844template <size_t N>
1845HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
1846 Vec128<double, N> b) {
1847 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1848}
1849
1850} // namespace detail
1851
1852template <typename T, size_t N>
1853HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
1854 return detail::Gt(hwy::TypeTag<T>(), a, b);
1855}
1856
1857// ------------------------------ Weak inequality
1858
1859template <size_t N>
1860HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1861 const Vec128<float, N> b) {
1862 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1863}
1864template <size_t N>
1865HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
1866 const Vec128<double, N> b) {
1867 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1868}
1869
1870#endif // HWY_TARGET <= HWY_AVX3
1871
1872// ------------------------------ Reversed comparisons
1873
1874template <typename T, size_t N>
1875HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1876 return b > a;
1877}
1878
1879template <typename T, size_t N>
1880HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
1881 return b >= a;
1882}
1883
1884// ------------------------------ FirstN (Iota, Lt)
1885
1886template <typename T, size_t N, HWY_IF_LE128(T, N)>
1887HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
1888#if HWY_TARGET <= HWY_AVX3
1889 (void)d;
1890 const uint64_t all = (1ull << N) - 1;
1891 // BZHI only looks at the lower 8 bits of num!
1892 const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
1893 return Mask128<T, N>::FromBits(bits);
1894#else
1895 const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1896 return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1897#endif
1898}
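// Illustrative sketch, not part of the original source: FirstN is the mask
// typically used for loop remainders; `d` and `ptr` below are hypothetical.
//   const Simd<int32_t, 4, 0> d;
//   const auto m = FirstN(d, 3);           // lanes {true, true, true, false}
//   const auto v = MaskedLoad(m, d, ptr);  // lane 3 is zero (see MEMORY (1))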
1899
1900template <class D>
1901using MFromD = decltype(FirstN(D(), 0));
1902
1903// ================================================== MEMORY (1)
1904
1905// Clang static analysis claims the memory immediately after a partial vector
1906// store is uninitialized, and also flags the input to partial loads (at least
1907// for loadl_pd) as "garbage". This is a false alarm because msan does not
1908// raise errors. We work around this by using CopyBytes instead of intrinsics,
1909// but only for the analyzer to avoid potentially bad code generation.
1910// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1911#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1912#if defined(__clang_analyzer__) || \
1913 (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1914#define HWY_SAFE_PARTIAL_LOAD_STORE 1
1915#else
1916#define HWY_SAFE_PARTIAL_LOAD_STORE 0
1917#endif
1918#endif // HWY_SAFE_PARTIAL_LOAD_STORE
1919
1920// ------------------------------ Load
1921
1922template <typename T>
1923HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1924 return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1925}
1926HWY_API Vec128<float> Load(Full128<float> /* tag */,
1927 const float* HWY_RESTRICT aligned) {
1928 return Vec128<float>{_mm_load_ps(aligned)};
1929}
1930HWY_API Vec128<double> Load(Full128<double> /* tag */,
1931 const double* HWY_RESTRICT aligned) {
1932 return Vec128<double>{_mm_load_pd(aligned)};
1933}
1934
1935template <typename T>
1936HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
1937 return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1938}
1939HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1940 const float* HWY_RESTRICT p) {
1941 return Vec128<float>{_mm_loadu_ps(p)};
1942}
1943HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
1944 const double* HWY_RESTRICT p) {
1945 return Vec128<double>{_mm_loadu_pd(p)};
1946}
1947
1948template <typename T>
1949HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
1950#if HWY_SAFE_PARTIAL_LOAD_STORE
1951 __m128i v = _mm_setzero_si128();
1952 CopyBytes<8>(p, &v); // not same size
1953 return Vec64<T>{v};
1954#else
1955 return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
1956#endif
1957}
1958
1959HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
1960 const float* HWY_RESTRICT p) {
1961#if HWY_SAFE_PARTIAL_LOAD_STORE
1962 __m128 v = _mm_setzero_ps();
1963 CopyBytes<8>(p, &v); // not same size
1964 return Vec128<float, 2>{v};
1965#else
1966 const __m128 hi = _mm_setzero_ps();
1967 return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
1968#endif
1969}
1970
1971HWY_API Vec64<double> Load(Full64<double> /* tag */,
1972 const double* HWY_RESTRICT p) {
1973#if HWY_SAFE_PARTIAL_LOAD_STORE
1974 __m128d v = _mm_setzero_pd();
1975 CopyBytes<8>(p, &v); // not same size
1976 return Vec64<double>{v};
1977#else
1978 return Vec64<double>{_mm_load_sd(p)};
1979#endif
1980}
1981
1982HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
1983 const float* HWY_RESTRICT p) {
1984#if HWY_SAFE_PARTIAL_LOAD_STORE
1985 __m128 v = _mm_setzero_ps();
1986 CopyBytes<4>(p, &v); // not same size
1987 return Vec128<float, 1>{v};
1988#else
1989 return Vec128<float, 1>{_mm_load_ss(p)};
1990#endif
1991}
1992
1993// Any <= 32 bit except <float, 1>
1994template <typename T, size_t N, HWY_IF_LE32(T, N)>
1995HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
1996 constexpr size_t kSize = sizeof(T) * N;
1997#if HWY_SAFE_PARTIAL_LOAD_STORE
1998 __m128 v = _mm_setzero_ps();
1999 CopyBytes<kSize>(p, &v); // not same size
2000 return Vec128<T, N>{v};
2001#else
2002 int32_t bits = 0;
2003 CopyBytes<kSize>(p, &bits); // not same size
2004 return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
2005#endif
2006}
2007
2008// For < 128 bit, LoadU == Load.
2009template <typename T, size_t N, HWY_IF_LE64(T, N)>
2010HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
2011 return Load(d, p);
2012}
2013
2014// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
2015template <typename T, size_t N, HWY_IF_LE128(T, N)>
2016HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
2017 return LoadU(d, p);
2018}
2019
2020// Returns a vector with lane i=[0, N) set to "first" + i.
2021template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
2022HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
2023 HWY_ALIGN T lanes[16 / sizeof(T)];
2024 for (size_t i = 0; i < 16 / sizeof(T); ++i) {
2025 lanes[i] =
2026 AddWithWraparound(hwy::IsFloatTag<T>(), static_cast<T>(first), i);
2027 }
2028 return Load(d, lanes);
2029}
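// Illustrative note, not part of the original source: with 4 x uint32 lanes,
// Iota(d, 100) yields {100, 101, 102, 103}; AddWithWraparound wraps integer
// lanes instead of overflowing, while float lanes use an ordinary add.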
2030
2031// ------------------------------ MaskedLoad
2032
2033#if HWY_TARGET <= HWY_AVX3
2034
2035template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2036HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2037 const T* HWY_RESTRICT p) {
2038 return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, p)};
2039}
2040
2041template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2042HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2043 const T* HWY_RESTRICT p) {
2044 return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
2045}
2046
2047template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2048HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2049 const T* HWY_RESTRICT p) {
2050 return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
2051}
2052
2053template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2054HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2055 const T* HWY_RESTRICT p) {
2056 return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
2057}
2058
2059template <size_t N>
2060HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m,
2061 Simd<float, N, 0> /* tag */,
2062 const float* HWY_RESTRICT p) {
2063 return Vec128<float, N>{_mm_maskz_loadu_ps(m.raw, p)};
2064}
2065
2066template <size_t N>
2067HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
2068 Simd<double, N, 0> /* tag */,
2069 const double* HWY_RESTRICT p) {
2070 return Vec128<double, N>{_mm_maskz_loadu_pd(m.raw, p)};
2071}
2072
2073#elif HWY_TARGET == HWY_AVX2
2074
2075template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2076HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2077 const T* HWY_RESTRICT p) {
2078 auto p_p = reinterpret_cast<const int*>(p); // NOLINT
2079 return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
2080}
2081
2082template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2083HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
2084 const T* HWY_RESTRICT p) {
2085 auto p_p = reinterpret_cast<const long long*>(p); // NOLINT
2086 return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
2087}
2088
2089template <size_t N>
2090HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N, 0> d,
2091 const float* HWY_RESTRICT p) {
2092 const Vec128<int32_t, N> mi =
2093 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2094 return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
2095}
2096
2097template <size_t N>
2098HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N, 0> d,
2099 const double* HWY_RESTRICT p) {
2100 const Vec128<int64_t, N> mi =
2101 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2102 return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
2103}
2104
2105// There is no maskload_epi8/16, so blend instead.
2106template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 6)> // 1 or 2 bytes
2107HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2108 const T* HWY_RESTRICT p) {
2109 return IfThenElseZero(m, Load(d, p));
2110}
2111
2112#else // <= SSE4
2113
2114// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2115template <typename T, size_t N>
2116HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
2117 const T* HWY_RESTRICT p) {
2118 return IfThenElseZero(m, Load(d, p));
2119}
2120
2121#endif
2122
2123// ------------------------------ Store
2124
2125template <typename T>
2126HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
2127 _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
2128}
2129HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
2130 float* HWY_RESTRICT aligned) {
2131 _mm_store_ps(aligned, v.raw);
2132}
2133HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
2134 double* HWY_RESTRICT aligned) {
2135 _mm_store_pd(aligned, v.raw);
2136}
2137
2138template <typename T>
2139HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
2140 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
2141}
2142HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
2143 float* HWY_RESTRICT p) {
2144 _mm_storeu_ps(p, v.raw);
2145}
2146HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
2147 double* HWY_RESTRICT p) {
2148 _mm_storeu_pd(p, v.raw);
2149}
2150
2151template <typename T>
2152HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
2153#if HWY_SAFE_PARTIAL_LOAD_STORE
2154 CopyBytes<8>(&v, p); // not same size
2155#else
2156 _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
2157#endif
2158}
2159HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
2160 float* HWY_RESTRICT p) {
2161#if HWY_SAFE_PARTIAL_LOAD_STORE
2162 CopyBytes<8>(&v, p); // not same size
2163#else
2164 _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
2165#endif
2166}
2167HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
2168 double* HWY_RESTRICT p) {
2169#if HWY_SAFE_PARTIAL_LOAD_STORE
2170 CopyBytes<8>(&v, p); // not same size
2171#else
2172 _mm_storel_pd(p, v.raw);
2173#endif
2174}
2175
2176// Any <= 32 bit except <float, 1>
2177template <typename T, size_t N, HWY_IF_LE32(T, N)>
2178HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2179 CopyBytes<sizeof(T) * N>(&v, p); // not same size
2180}
2181HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
2182 float* HWY_RESTRICT p) {
2183#if HWY_SAFE_PARTIAL_LOAD_STORE
2184 CopyBytes<4>(&v, p); // not same size
2185#else
2186 _mm_store_ss(p, v.raw);
2187#endif
2188}
2189
2190// For < 128 bit, StoreU == Store.
2191template <typename T, size_t N, HWY_IF_LE64(T, N)>
2192HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
2193 Store(v, d, p);
2194}
2195
2196// ------------------------------ BlendedStore
2197
2198namespace detail {
2199
2200// There is no maskload_epi8/16 with which we could safely implement
2201// BlendedStore. Manual blending is also unsafe because loading a full vector
2202// that crosses the array end causes asan faults. Resort to scalar code; the
2203// caller should instead use memcpy, assuming m is FirstN(d, n).
2204template <typename T, size_t N>
2205HWY_API void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2206 T* HWY_RESTRICT p) {
2207 const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
2208 using TI = TFromD<decltype(di)>;
2209 alignas(16) TI buf[N];
2210 alignas(16) TI mask[N];
2211 Store(BitCast(di, v), di, buf);
2212 Store(BitCast(di, VecFromMask(d, m)), di, mask);
2213 for (size_t i = 0; i < N; ++i) {
2214 if (mask[i]) {
2215 CopySameSize(buf + i, p + i);
2216 }
2217 }
2218}
2219} // namespace detail
2220
2221#if HWY_TARGET <= HWY_AVX3
2222
2223template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2224HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2225 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2226 _mm_mask_storeu_epi8(p, m.raw, v.raw);
2227}
2228template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2229HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2230 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2231 _mm_mask_storeu_epi16(p, m.raw, v.raw);
2232}
2233
2234template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2235HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2236 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2237 auto pi = reinterpret_cast<int*>(p); // NOLINT
2238 _mm_mask_storeu_epi32(pi, m.raw, v.raw);
2239}
2240
2241template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2242HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2243 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2244 auto pi = reinterpret_cast<long long*>(p); // NOLINT
2245 _mm_mask_storeu_epi64(pi, m.raw, v.raw);
2246}
2247
2248template <size_t N>
2249HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2250 Simd<float, N, 0>, float* HWY_RESTRICT p) {
2251 _mm_mask_storeu_ps(p, m.raw, v.raw);
2252}
2253
2254template <size_t N>
2255HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2256 Simd<double, N, 0>, double* HWY_RESTRICT p) {
2257 _mm_mask_storeu_pd(p, m.raw, v.raw);
2258}
2259
2260#elif HWY_TARGET == HWY_AVX2
2261
2262template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 6)> // 1 or 2 bytes
2263HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2264 T* HWY_RESTRICT p) {
2265 detail::ScalarMaskedStore(v, m, d, p);
2266}
2267
2268template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2269HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2270 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2271 // For partial vectors, avoid writing other lanes by zeroing their mask.
2272 if (N < 4) {
2273 const Full128<T> df;
2274 const Mask128<T> mf{m.raw};
2275 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2276 }
2277
2278 auto pi = reinterpret_cast<int*>(p); // NOLINT
2279 _mm_maskstore_epi32(pi, m.raw, v.raw);
2280}
2281
2282template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2283HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
2284 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
2285 // For partial vectors, avoid writing other lanes by zeroing their mask.
2286 if (N < 2) {
2287 const Full128<T> df;
2288 const Mask128<T> mf{m.raw};
2289 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2290 }
2291
2292 auto pi = reinterpret_cast<long long*>(p); // NOLINT
2293 _mm_maskstore_epi64(pi, m.raw, v.raw);
2294}
2295
2296template <size_t N>
2297HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2298 Simd<float, N, 0> d, float* HWY_RESTRICT p) {
2299 using T = float;
2300 // For partial vectors, avoid writing other lanes by zeroing their mask.
2301 if (N < 4) {
2302 const Full128<T> df;
2303 const Mask128<T> mf{m.raw};
2304 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2305 }
2306
2307 const Vec128<MakeSigned<T>, N> mi =
2308 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2309 _mm_maskstore_ps(p, mi.raw, v.raw);
2310}
2311
2312template <size_t N>
2313HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2314 Simd<double, N, 0> d, double* HWY_RESTRICT p) {
2315 using T = double;
2316 // For partial vectors, avoid writing other lanes by zeroing their mask.
2317 if (N < 2) {
2318 const Full128<T> df;
2319 const Mask128<T> mf{m.raw};
2320 m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
2321 }
2322
2323 const Vec128<MakeSigned<T>, N> mi =
2324 BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2325 _mm_maskstore_pd(p, mi.raw, v.raw);
2326}
2327
2328#else // <= SSE4
2329
2330template <typename T, size_t N>
2331HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2332 T* HWY_RESTRICT p) {
2333 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2334 detail::ScalarMaskedStore(v, m, d, p);
2335}
2336
2337#endif // SSE4
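// Illustrative sketch, not part of the original source: FirstN plus
// BlendedStore handles the tail of an array without writing past the end;
// `out`, `i` and `remaining` are hypothetical.
//   const Simd<float, 4, 0> d;
//   BlendedStore(v, FirstN(d, remaining), d, out + i);  // remaining < 4 lanes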
2338
2339// ================================================== ARITHMETIC
2340
2341// ------------------------------ Addition
2342
2343// Unsigned
2344template <size_t N>
2345HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
2346 const Vec128<uint8_t, N> b) {
2347 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2348}
2349template <size_t N>
2350HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
2351 const Vec128<uint16_t, N> b) {
2352 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2353}
2354template <size_t N>
2355HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
2356 const Vec128<uint32_t, N> b) {
2357 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2358}
2359template <size_t N>
2360HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
2361 const Vec128<uint64_t, N> b) {
2362 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2363}
2364
2365// Signed
2366template <size_t N>
2367HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
2368 const Vec128<int8_t, N> b) {
2369 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2370}
2371template <size_t N>
2372HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
2373 const Vec128<int16_t, N> b) {
2374 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2375}
2376template <size_t N>
2377HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
2378 const Vec128<int32_t, N> b) {
2379 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2380}
2381template <size_t N>
2382HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
2383 const Vec128<int64_t, N> b) {
2384 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2385}
2386
2387// Float
2388template <size_t N>
2389HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
2390 const Vec128<float, N> b) {
2391 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
2392}
2393template <size_t N>
2394HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
2395 const Vec128<double, N> b) {
2396 return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
2397}
2398
2399// ------------------------------ Subtraction
2400
2401// Unsigned
2402template <size_t N>
2403HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
2404 const Vec128<uint8_t, N> b) {
2405 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2406}
2407template <size_t N>
2408HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
2409 Vec128<uint16_t, N> b) {
2410 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2411}
2412template <size_t N>
2413HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
2414 const Vec128<uint32_t, N> b) {
2415 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2416}
2417template <size_t N>
2418HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
2419 const Vec128<uint64_t, N> b) {
2420 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2421}
2422
2423// Signed
2424template <size_t N>
2425HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
2426 const Vec128<int8_t, N> b) {
2427 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2428}
2429template <size_t N>
2430HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
2431 const Vec128<int16_t, N> b) {
2432 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2433}
2434template <size_t N>
2435HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
2436 const Vec128<int32_t, N> b) {
2437 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2438}
2439template <size_t N>
2440HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
2441 const Vec128<int64_t, N> b) {
2442 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2443}
2444
2445// Float
2446template <size_t N>
2447HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
2448 const Vec128<float, N> b) {
2449 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
2450}
2451template <size_t N>
2452HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
2453 const Vec128<double, N> b) {
2454 return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
2455}
2456
2457// ------------------------------ SumsOf8
2458template <size_t N>
2459HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
2460 return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
2461}
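// Illustrative note, not part of the original source: _mm_sad_epu8 against a
// zero vector sums each group of 8 consecutive u8 lanes into one u64 lane,
// e.g. u8 {1,2,3,4,5,6,7,8, 10,10,10,10,10,10,10,10} -> u64 {36, 80}.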
2462
2463// ------------------------------ SaturatedAdd
2464
2465// Returns a + b clamped to the destination range.
2466
2467// Unsigned
2468template <size_t N>
2469HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
2470 const Vec128<uint8_t, N> b) {
2471 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
2472}
2473template <size_t N>
2474HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
2475 const Vec128<uint16_t, N> b) {
2476 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
2477}
2478
2479// Signed
2480template <size_t N>
2481HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
2482 const Vec128<int8_t, N> b) {
2483 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
2484}
2485template <size_t N>
2486HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
2487 const Vec128<int16_t, N> b) {
2488 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
2489}
2490
2491// ------------------------------ SaturatedSub
2492
2493// Returns a - b clamped to the destination range.
2494
2495// Unsigned
2496template <size_t N>
2497HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
2498 const Vec128<uint8_t, N> b) {
2499 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2500}
2501template <size_t N>
2502HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
2503 const Vec128<uint16_t, N> b) {
2504 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2505}
2506
2507// Signed
2508template <size_t N>
2509HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
2510 const Vec128<int8_t, N> b) {
2511 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2512}
2513template <size_t N>
2514HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
2515 const Vec128<int16_t, N> b) {
2516 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2517}
2518
2519// ------------------------------ AverageRound
2520
2521// Returns (a + b + 1) / 2
2522
2523// Unsigned
2524template <size_t N>
2525HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
2526 const Vec128<uint8_t, N> b) {
2527 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2528}
2529template <size_t N>
2530HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
2531 const Vec128<uint16_t, N> b) {
2532 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2533}
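// Illustrative note, not part of the original source: the +1 makes ties round
// up, e.g. for u8 lanes AverageRound(4, 7) = (4 + 7 + 1) / 2 = 6, whereas
// SaturatedAdd above clamps, e.g. u8 SaturatedAdd(200, 100) = 255.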
2534
2535// ------------------------------ Integer multiplication
2536
2537template <size_t N>
2538HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
2539 const Vec128<uint16_t, N> b) {
2540 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2541}
2542template <size_t N>
2543HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
2544 const Vec128<int16_t, N> b) {
2545 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2546}
2547
2548// Returns the upper 16 bits of a * b in each lane.
2549template <size_t N>
2550HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
2551 const Vec128<uint16_t, N> b) {
2552 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2553}
2554template <size_t N>
2555HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
2556 const Vec128<int16_t, N> b) {
2557 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2558}
2559
2560template <size_t N>
2561HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
2562 const Vec128<int16_t, N> b) {
2563 return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
2564}
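// Illustrative note, not part of the original source: _mm_mulhrs_epi16
// computes a rounded Q15 product, (a * b + (1 << 14)) >> 15 per lane, e.g.
// 16384 (0.5) * 8192 (0.25) -> 4096 (0.125).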
2565
2566// Multiplies even lanes (0, 2, ..); the lower half of each double-wide
2567// result is placed in the even lane and the upper half in its odd neighbor.
2568template <size_t N>
2569HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
2570 const Vec128<uint32_t, N> b) {
2571 return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2572}
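// Illustrative note, not part of the original source: for u32 lanes
// {a0, a1, a2, a3} and {b0, b1, b2, b3}, MulEven returns the 64-bit products
// {a0*b0, a2*b2}, each spanning one even/odd pair of 32-bit lanes.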
2573
2574#if HWY_TARGET == HWY_SSSE3
2575
2576template <size_t N, HWY_IF_LE64(int32_t, N)> // N=1 or 2
2577HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2578 const Vec128<int32_t, N> b) {
2579 return Set(Simd<int64_t, (N + 1) / 2, 0>(),
2580 static_cast<int64_t>(GetLane(a)) * GetLane(b));
2581}
2582HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
2583 const Vec128<int32_t> b) {
2584 alignas(16) int32_t a_lanes[4];
2585 alignas(16) int32_t b_lanes[4];
2586 const Full128<int32_t> di32;
2587 Store(a, di32, a_lanes);
2588 Store(b, di32, b_lanes);
2589 alignas(16) int64_t mul[2];
2590 mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
2591 mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
2592 return Load(Full128<int64_t>(), mul);
2593}
2594
2595#else // HWY_TARGET == HWY_SSSE3
2596
2597template <size_t N>
2598HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2599 const Vec128<int32_t, N> b) {
2600 return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2601}
2602
2603#endif // HWY_TARGET == HWY_SSSE3
2604
2605template <size_t N>
2606HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
2607 const Vec128<uint32_t, N> b) {
2608#if HWY_TARGET == HWY_SSSE3
2609 // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
2610 // 64-bit right shift would also work but also needs port 5, so no benefit.
2611 // Notation: x=don't care, z=0.
2612 const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2613 const auto mullo_x2x0 = MulEven(a, b);
2614 const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2615 const auto mullo_x3x1 =
2616 MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2617 // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
2618 // the latter requires one more instruction or a constant.
2619 const __m128i mul_20 =
2620 _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2621 const __m128i mul_31 =
2622 _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2623 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2624#else
2625 return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2626#endif
2627}
2628
2629template <size_t N>
2630HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
2631 const Vec128<int32_t, N> b) {
2632 // Same as unsigned; avoid duplicating the SSSE3 code.
2633 const DFromV<decltype(a)> d;
2634 const RebindToUnsigned<decltype(d)> du;
2635 return BitCast(d, BitCast(du, a) * BitCast(du, b));
2636}
2637
2638// ------------------------------ RotateRight (ShiftRight, Or)
2639
2640template <int kBits, size_t N>
2641HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
2642 static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
2643#if HWY_TARGET <= HWY_AVX3
2644 return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
2645#else
2646 if (kBits == 0) return v;
2647 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
2648#endif
2649}
2650
2651template <int kBits, size_t N>
2652HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
2653 static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
2654#if HWY_TARGET <= HWY_AVX3
2655 return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
2656#else
2657 if (kBits == 0) return v;
2658 return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
2659#endif
2660}
2661
2662// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2663
2664template <size_t N>
2665HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
2666 const DFromV<decltype(v)> d;
2667 return VecFromMask(v < Zero(d));
2668}
2669
2670template <size_t N>
2671HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
2672 return ShiftRight<15>(v);
2673}
2674
2675template <size_t N>
2676HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
2677 return ShiftRight<31>(v);
2678}
2679
2680template <size_t N>
2681HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
2682 const DFromV<decltype(v)> d;
2683#if HWY_TARGET <= HWY_AVX3
2684 (void)d;
2685 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
2686#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2687 return VecFromMask(v < Zero(d));
2688#else
2689 // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
2690 // avoids generating a zero.
2691 const RepartitionToNarrow<decltype(d)> d32;
2692 const auto sign = ShiftRight<31>(BitCast(d32, v));
2693 return Vec128<int64_t, N>{
2694 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2695#endif
2696}
2697
2698template <size_t N>
2699HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
2700#if HWY_TARGET <= HWY_AVX3
2701 return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2702#else
2703 const auto zero = Zero(DFromV<decltype(v)>());
2704 return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2705#endif
2706}
2707
2708template <int kBits, size_t N>
2709HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
2710#if HWY_TARGET <= HWY_AVX3
2711 return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
2712#else
2713 const DFromV<decltype(v)> di;
2714 const RebindToUnsigned<decltype(di)> du;
2715 const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2716 const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2717 return right | sign;
2718#endif
2719}
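// Illustrative note, not part of the original source: the pre-AVX3 branch
// above rebuilds an arithmetic shift from a logical one. For kBits=4 and lane
// 0xF000000000000000, the logical shift gives 0x0F00...00 and the shifted
// sign mask contributes 0xF000...00, so the OR yields 0xFF00000000000000,
// matching a true arithmetic shift.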
2720
2721// ------------------------------ ZeroIfNegative (BroadcastSignBit)
2722template <typename T, size_t N>
2723HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2724 static_assert(IsFloat<T>(), "Only works for float");
2725 const DFromV<decltype(v)> d;
2726#if HWY_TARGET == HWY_SSSE3
2727 const RebindToSigned<decltype(d)> di;
2728 const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2729#else
2730 const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
2731#endif
2732 return IfThenElse(mask, Zero(d), v);
2733}
2734
2735// ------------------------------ IfNegativeThenElse
2736template <size_t N>
2737HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
2738 const Vec128<int8_t, N> yes,
2739 const Vec128<int8_t, N> no) {
2740 // int8: IfThenElse only looks at the MSB.
2741 return IfThenElse(MaskFromVec(v), yes, no);
2742}
2743
2744template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2745HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2746 Vec128<T, N> no) {
2747 static_assert(IsSigned<T>(), "Only works for signed/float");
2748 const DFromV<decltype(v)> d;
2749 const RebindToSigned<decltype(d)> di;
2750
2751 // 16-bit: no native blendv, so copy sign to lower byte's MSB.
2752 v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
2753 return IfThenElse(MaskFromVec(v), yes, no);
2754}
2755
2756template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2757HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2758 Vec128<T, N> no) {
2759 static_assert(IsSigned<T>(), "Only works for signed/float");
2760 const DFromV<decltype(v)> d;
2761 const RebindToFloat<decltype(d)> df;
2762
2763 // 32/64-bit: use float IfThenElse, which only looks at the MSB.
2764 return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes),
2765 BitCast(df, no)));
2766}
2767
2768// ------------------------------ ShiftLeftSame
2769
2770template <size_t N>
2771HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
2772 const int bits) {
2773 return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2774}
2775template <size_t N>
2776HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
2777 const int bits) {
2778 return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2779}
2780template <size_t N>
2781HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
2782 const int bits) {
2783 return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2784}
2785
2786template <size_t N>
2787HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
2788 const int bits) {
2789 return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2790}
2791
2792template <size_t N>
2793HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
2794 const int bits) {
2795 return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2796}
2797
2798template <size_t N>
2799HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
2800 const int bits) {
2801 return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2802}
2803
2804template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2805HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
2806 const DFromV<decltype(v)> d8;
2807 // Use raw instead of BitCast to support N=1.
2808 const Vec128<T, N> shifted{
2809 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
2810 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2811}
2812
2813// ------------------------------ ShiftRightSame (BroadcastSignBit)
2814
2815template <size_t N>
2816HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
2817 const int bits) {
2818 return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2819}
2820template <size_t N>
2821HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
2822 const int bits) {
2823 return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2824}
2825template <size_t N>
2826HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
2827 const int bits) {
2828 return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2829}
2830
2831template <size_t N>
2832HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
2833 const int bits) {
2834 const DFromV<decltype(v)> d8;
2835 // Use raw instead of BitCast to support N=1.
2836 const Vec128<uint8_t, N> shifted{
2837 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
2838 return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2839}
2840
2841template <size_t N>
2842HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
2843 const int bits) {
2844 return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2845}
2846
2847template <size_t N>
2848HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
2849 const int bits) {
2850 return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2851}
2852template <size_t N>
2853HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
2854 const int bits) {
2855#if HWY_TARGET <= HWY_AVX3
2856 return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2857#else
2858 const DFromV<decltype(v)> di;
2859 const RebindToUnsigned<decltype(di)> du;
2860 const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2861 const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2862 return right | sign;
2863#endif
2864}
2865
2866template <size_t N>
2867HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
2868 const DFromV<decltype(v)> di;
2869 const RebindToUnsigned<decltype(di)> du;
2870 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2871 const auto shifted_sign =
2872 BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2873 return (shifted ^ shifted_sign) - shifted_sign;
2874}
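// Illustrative note, not part of the original source: the i8 path above
// sign-extends a logical shift via (x ^ s) - s with s = 0x80 >> bits. For
// bits=2 and v = 0xF0 (-16): logical shift -> 0x3C, s = 0x20, and
// (0x3C ^ 0x20) - 0x20 = 0x1C - 0x20 = -4, which equals -16 >> 2.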
2875
2876// ------------------------------ Floating-point mul / div
2877
2878template <size_t N>
2879HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
2880 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2881}
2882HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
2883 const Vec128<float, 1> b) {
2884 return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
2885}
2886template <size_t N>
2887HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
2888 const Vec128<double, N> b) {
2889 return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
2890}
2891HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
2892 return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
2893}
2894
2895template <size_t N>
2896HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
2897 const Vec128<float, N> b) {
2898 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2899}
2900HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
2901 const Vec128<float, 1> b) {
2902 return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
2903}
2904template <size_t N>
2905HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
2906 const Vec128<double, N> b) {
2907 return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
2908}
2909HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
2910 return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
2911}
2912
2913// Approximate reciprocal
2914template <size_t N>
2915HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
2916 return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2917}
2918HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
2919 return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
2920}
2921
2922// Absolute value of difference.
2923template <size_t N>
2924HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
2925 const Vec128<float, N> b) {
2926 return Abs(a - b);
2927}
2928
2929// ------------------------------ Floating-point multiply-add variants
2930
2931// Returns mul * x + add
2932template <size_t N>
2933HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
2934 const Vec128<float, N> x,
2935 const Vec128<float, N> add) {
2936#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2937 return mul * x + add;
2938#else
2939 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2940#endif
2941}
2942template <size_t N>
2943HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul,
2944 const Vec128<double, N> x,
2945 const Vec128<double, N> add) {
2946#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2947 return mul * x + add;
2948#else
2949 return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
2950#endif
2951}
2952
2953// Returns add - mul * x
2954template <size_t N>
2955HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
2956 const Vec128<float, N> x,
2957 const Vec128<float, N> add) {
2958#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2959 return add - mul * x;
2960#else
2961 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2962#endif
2963}
2964template <size_t N>
2965HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul,
2966 const Vec128<double, N> x,
2967 const Vec128<double, N> add) {
2968#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2969 return add - mul * x;
2970#else
2971 return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
2972#endif
2973}
2974
2975// Returns mul * x - sub
2976template <size_t N>
2977HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
2978 const Vec128<float, N> x,
2979 const Vec128<float, N> sub) {
2980#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2981 return mul * x - sub;
2982#else
2983 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2984#endif
2985}
2986template <size_t N>
2987HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
2988 const Vec128<double, N> x,
2989 const Vec128<double, N> sub) {
2990#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2991 return mul * x - sub;
2992#else
2993 return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
2994#endif
2995}
2996
2997// Returns -mul * x - sub
2998template <size_t N>
2999HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
3000 const Vec128<float, N> x,
3001 const Vec128<float, N> sub) {
3002#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3003 return Neg(mul) * x - sub;
3004#else
3005 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
3006#endif
3007}
3008template <size_t N>
3009HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
3010 const Vec128<double, N> x,
3011 const Vec128<double, N> sub) {
3012#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3013 return Neg(mul) * x - sub;
3014#else
3015 return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
3016#endif
3017}
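// Illustrative sketch, not part of the original source: these fold to single
// FMA instructions on AVX2/AVX-512 targets; a common use is Horner evaluation
// (coefficients c0..c2 are hypothetical):
//   const auto y = MulAdd(x, MulAdd(x, c2, c1), c0);  // c2*x^2 + c1*x + c0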
3018
3019// ------------------------------ Floating-point square root
3020
3021// Full precision square root
3022template <size_t N>
3023HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
3024 return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
3025}
3026HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) {
3027 return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
3028}
3029template <size_t N>
3030HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
3031 return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
3032}
3033HWY_API Vec64<double> Sqrt(const Vec64<double> v) {
3034 return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
3035}
3036
3037// Approximate reciprocal square root
3038template <size_t N>
3039HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
3040 return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
3041}
3042HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) {
3043 return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
3044}
3045
3046// ------------------------------ Min (Gt, IfThenElse)
3047
3048namespace detail {
3049
3050template <typename T, size_t N>
3051HWY_INLINE Vec128<T, N> MinU(const Vec128<T, N> a,
3052 const Vec128<T, N> b) {
3053 const DFromV<decltype(a)> d;
3054 const RebindToUnsigned<decltype(d)> du;
3055 const RebindToSigned<decltype(d)> di;
3056 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
3057 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
3058 return IfThenElse(gt, b, a);
3059}
3060
3061} // namespace detail
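// Illustrative note, not part of the original source: MinU emulates unsigned
// Min where _mm_min_epu16/32 are unavailable by flipping the sign bit of both
// inputs, which maps unsigned order onto signed order. E.g. u16 0xFFFF vs
// 0x0000 become 0x7FFF and 0x8000, and the signed 0x7FFF > -32768 agrees with
// the unsigned 0xFFFF > 0.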
3062
3063// Unsigned
3064template <size_t N>
3065HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
3066 const Vec128<uint8_t, N> b) {
3067 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
3068}
3069template <size_t N>
3070HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
3071 const Vec128<uint16_t, N> b) {
3072#if HWY_TARGET == HWY_SSSE3
3073 return detail::MinU(a, b);
3074#else
3075 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
3076#endif
3077}
3078template <size_t N>
3079HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
3080 const Vec128<uint32_t, N> b) {
3081#if HWY_TARGET == HWY_SSSE3
3082 return detail::MinU(a, b);
3083#else
3084 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
3085#endif
3086}
3087template <size_t N>
3088HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
3089 const Vec128<uint64_t, N> b) {
3090#if HWY_TARGET <= HWY_AVX3
3091 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
3092#else
3093 return detail::MinU(a, b);
3094#endif
3095}
3096
3097// Signed
3098template <size_t N>
3099HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
3100 const Vec128<int8_t, N> b) {
3101#if HWY_TARGET == HWY_SSSE3
3102 return IfThenElse(a < b, a, b);
3103#else
3104 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
3105#endif
3106}
3107template <size_t N>
3108HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
3109 const Vec128<int16_t, N> b) {
3110 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
3111}
3112template <size_t N>
3113HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
3114 const Vec128<int32_t, N> b) {
3115#if HWY_TARGET == HWY_SSSE3
3116 return IfThenElse(a < b, a, b);
3117#else
3118 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
3119#endif
3120}
3121template <size_t N>
3122HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
3123 const Vec128<int64_t, N> b) {
3124#if HWY_TARGET <= HWY_AVX3
3125 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
3126#else
3127 return IfThenElse(a < b, a, b);
3128#endif
3129}
3130
3131// Float
3132template <size_t N>
3133HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
3134 const Vec128<float, N> b) {
3135 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
3136}
3137template <size_t N>
3138HWY_API Vec128<double, N> Min(const Vec128<double, N> a,
3139 const Vec128<double, N> b) {
3140 return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
3141}
3142
3143// ------------------------------ Max (Gt, IfThenElse)
3144
3145namespace detail {
3146template <typename T, size_t N>
3147HWY_INLINE Vec128<T, N> MaxU(const Vec128<T, N> a,
3148 const Vec128<T, N> b) {
3149 const DFromV<decltype(a)> d;
3150 const RebindToUnsigned<decltype(d)> du;
3151 const RebindToSigned<decltype(d)> di;
3152 const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
3153 const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
3154 return IfThenElse(gt, a, b);
3155}
3156
3157} // namespace detail
3158
3159// Unsigned
3160template <size_t N>
3161HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
3162 const Vec128<uint8_t, N> b) {
3163 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
3164}
3165template <size_t N>
3166HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
3167 const Vec128<uint16_t, N> b) {
3168#if HWY_TARGET == HWY_SSSE3
3169 return detail::MaxU(a, b);
3170#else
3171 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
3172#endif
3173}
3174template <size_t N>
3175HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
3176 const Vec128<uint32_t, N> b) {
3177#if HWY_TARGET == HWY_SSSE3
3178 return detail::MaxU(a, b);
3179#else
3180 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
3181#endif
3182}
3183template <size_t N>
3184HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
3185 const Vec128<uint64_t, N> b) {
3186#if HWY_TARGET <= HWY_AVX3
3187 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
3188#else
3189 return detail::MaxU(a, b);
3190#endif
3191}
3192
3193// Signed
3194template <size_t N>
3195HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
3196 const Vec128<int8_t, N> b) {
3197#if HWY_TARGET == HWY_SSSE3
3198 return IfThenElse(a < b, b, a);
3199#else
3200 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
3201#endif
3202}
3203template <size_t N>
3204HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
3205 const Vec128<int16_t, N> b) {
3206 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
3207}
3208template <size_t N>
3209HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
3210 const Vec128<int32_t, N> b) {
3211#if HWY_TARGET == HWY_SSSE3
3212 return IfThenElse(a < b, b, a);
3213#else
3214 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
3215#endif
3216}
3217template <size_t N>
3218HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
3219 const Vec128<int64_t, N> b) {
3220#if HWY_TARGET <= HWY_AVX3
3221 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
3222#else
3223 return IfThenElse(a < b, b, a);
3224#endif
3225}
3226
3227// Float
3228template <size_t N>
3229HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
3230 const Vec128<float, N> b) {
3231 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
3232}
3233template <size_t N>
3234HWY_API Vec128<double, N> Max(const Vec128<double, N> a,
3235 const Vec128<double, N> b) {
3236 return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
3237}
3238
3239// ================================================== MEMORY (2)
3240
3241// ------------------------------ Non-temporal stores
3242
3243// On clang6, we see incorrect code generated for _mm_stream_pi, so
3244// round even partial vectors up to 16 bytes.
3245template <typename T, size_t N>
3246HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
3247 T* HWY_RESTRICT aligned) {
3248 _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
3249}
3250template <size_t N>
3251HWY_API void Stream(const Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3252 float* HWY_RESTRICT aligned) {
3253 _mm_stream_ps(aligned, v.raw);
3254}
3255template <size_t N>
3256HWY_API void Stream(const Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3257 double* HWY_RESTRICT aligned) {
3258 _mm_stream_pd(aligned, v.raw);
3259}
3260
3261// ------------------------------ Scatter
3262
3263// Work around warnings in the intrinsic definitions (passing -1 as a mask).
3264HWY_DIAGNOSTICS(push)
3265HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3266
3267// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
3268using GatherIndex64 = long long int; // NOLINT(runtime/int)
3269static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
3270
3271#if HWY_TARGET <= HWY_AVX3
3272namespace detail {
3273
3274template <typename T, size_t N>
3275HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
3276 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3277 const Vec128<int32_t, N> offset) {
3278 if (N == 4) {
3279 _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
3280 } else {
3281 const __mmask8 mask = (1u << N) - 1;
3282 _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
3283 }
3284}
3285template <typename T, size_t N>
3286HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
3287 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3288 const Vec128<int32_t, N> index) {
3289 if (N == 4) {
3290 _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
3291 } else {
3292 const __mmask8 mask = (1u << N) - 1;
3293 _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
3294 }
3295}
3296
3297template <typename T, size_t N>
3298HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
3299 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3300 const Vec128<int64_t, N> offset) {
3301 if (N == 2) {
3302 _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
3303 } else {
3304 const __mmask8 mask = (1u << N) - 1;
3305 _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
3306 }
3307}
3308template <typename T, size_t N>
3309HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
3310 Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3311 const Vec128<int64_t, N> index) {
3312 if (N == 2) {
3313 _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
3314 } else {
3315 const __mmask8 mask = (1u << N) - 1;
3316 _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
3317 }
3318}
3319
3320} // namespace detail
3321
3322template <typename T, size_t N, typename Offset>
3323HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3324 T* HWY_RESTRICT base,
3325 const Vec128<Offset, N> offset) {
3326 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3327 return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
3328}
3329template <typename T, size_t N, typename Index>
3330HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3331 const Vec128<Index, N> index) {
3332 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3333 return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
3334}
3335
3336template <size_t N>
3337HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3338 float* HWY_RESTRICT base,
3339 const Vec128<int32_t, N> offset) {
3340 if (N == 4) {
3341 _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
3342 } else {
3343 const __mmask8 mask = (1u << N) - 1;
3344 _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
3345 }
3346}
3347template <size_t N>
3348HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
3349 float* HWY_RESTRICT base,
3350 const Vec128<int32_t, N> index) {
3351 if (N == 4) {
3352 _mm_i32scatter_ps(base, index.raw, v.raw, 4);
3353 } else {
3354 const __mmask8 mask = (1u << N) - 1;
3355 _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
3356 }
3357}
3358
3359template <size_t N>
3360HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3361 double* HWY_RESTRICT base,
3362 const Vec128<int64_t, N> offset) {
3363 if (N == 2) {
3364 _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
3365 } else {
3366 const __mmask8 mask = (1u << N) - 1;
3367 _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
3368 }
3369}
3370template <size_t N>
3371HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
3372 double* HWY_RESTRICT base,
3373 const Vec128<int64_t, N> index) {
3374 if (N == 2) {
3375 _mm_i64scatter_pd(base, index.raw, v.raw, 8);
3376 } else {
3377 const __mmask8 mask = (1u << N) - 1;
3378 _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
3379 }
3380}
3381#else // HWY_TARGET <= HWY_AVX3
3382
3383template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
3384HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3385 T* HWY_RESTRICT base,
3386 const Vec128<Offset, N> offset) {
3387 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3388
3389 alignas(16) T lanes[N];
3390 Store(v, d, lanes);
3391
3392 alignas(16) Offset offset_lanes[N];
3393 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3394
3395 uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
3396 for (size_t i = 0; i < N; ++i) {
3397 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
3398 }
3399}
3400
3401template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
3402HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3403 const Vec128<Index, N> index) {
3404 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3405
3406 alignas(16) T lanes[N];
3407 Store(v, d, lanes);
3408
3409 alignas(16) Index index_lanes[N];
3410 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3411
3412 for (size_t i = 0; i < N; ++i) {
3413 base[index_lanes[i]] = lanes[i];
3414 }
3415}
3416
3417#endif
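// Illustrative sketch, not part of the original source: ScatterIndex writes
// lane i of v to base[index[i]]; value and index lanes must have equal size.
// `out` and `idx` are hypothetical.
//   const Simd<float, 4, 0> d;
//   const Rebind<int32_t, decltype(d)> di;
//   ScatterIndex(v, d, out, Load(di, idx));  // out[idx[i]] = v[i]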
3418
3419// ------------------------------ Gather (Load/Store)
3420
3421#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3422
3423template <typename T, size_t N, typename Offset>
3424HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
3425 const T* HWY_RESTRICT base,
3426 const Vec128<Offset, N> offset) {
3427 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3428
3429 alignas(16) Offset offset_lanes[N];
3430 Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3431
3432 alignas(16) T lanes[N];
3433 const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
3434 for (size_t i = 0; i < N; ++i) {
3435 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
3436 }
3437 return Load(d, lanes);
3438}
3439
3440template <typename T, size_t N, typename Index>
3441HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
3442 const T* HWY_RESTRICT base,
3443 const Vec128<Index, N> index) {
3444 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3445
3446 alignas(16) Index index_lanes[N];
3447 Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3448
3449 alignas(16) T lanes[N];
3450 for (size_t i = 0; i < N; ++i) {
3451 lanes[i] = base[index_lanes[i]];
3452 }
3453 return Load(d, lanes);
3454}
3455
3456#else
3457
3458namespace detail {
3459
3460template <typename T, size_t N>
3461HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
3462 Simd<T, N, 0> /* d */,
3463 const T* HWY_RESTRICT base,
3464 const Vec128<int32_t, N> offset) {
3465 return Vec128<T, N>{_mm_i32gather_epi32(
3466 reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
3467}
3468template <typename T, size_t N>
3469HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
3470 Simd<T, N, 0> /* d */,
3471 const T* HWY_RESTRICT base,
3472 const Vec128<int32_t, N> index) {
3473 return Vec128<T, N>{_mm_i32gather_epi32(
3474 reinterpret_cast<const int32_t*>(base), index.raw, 4)};
3475}
3476
3477template <typename T, size_t N>
3478HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
3479 Simd<T, N, 0> /* d */,
3480 const T* HWY_RESTRICT base,
3481 const Vec128<int64_t, N> offset) {
3482 return Vec128<T, N>{_mm_i64gather_epi64(
3483 reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
3484}
3485template <typename T, size_t N>
3486HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
3487 Simd<T, N, 0> /* d */,
3488 const T* HWY_RESTRICT base,
3489 const Vec128<int64_t, N> index) {
3490 return Vec128<T, N>{_mm_i64gather_epi64(
3491 reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
3492}
3493
3494} // namespace detail
3495
3496template <typename T, size_t N, typename Offset>
3497HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3498 const Vec128<Offset, N> offset) {
3499 return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
3500}
3501template <typename T, size_t N, typename Index>
3502HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3503 const Vec128<Index, N> index) {
3504 return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
3505}
3506
3507template <size_t N>
3508HWY_API Vec128<float, N> GatherOffset(Simd<float, N, 0> /* tag */,
3509 const float* HWY_RESTRICT base,
3510 const Vec128<int32_t, N> offset) {
3511 return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
3512}
3513template <size_t N>
3514HWY_API Vec128<float, N> GatherIndex(Simd<float, N, 0> /* tag */,
3515 const float* HWY_RESTRICT base,
3516 const Vec128<int32_t, N> index) {
3517 return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3518}
3519
3520template <size_t N>
3521HWY_API Vec128<double, N> GatherOffset(Simd<double, N, 0> /* tag */,
3522 const double* HWY_RESTRICT base,
3523 const Vec128<int64_t, N> offset) {
3524 return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3525}
3526template <size_t N>
3527HWY_API Vec128<double, N> GatherIndex(Simd<double, N, 0> /* tag */,
3528 const double* HWY_RESTRICT base,
3529 const Vec128<int64_t, N> index) {
3530 return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3531}
3532
3533#endif // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
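// Illustrative sketch, not part of the original source: GatherIndex loads
// base[index[i]] into lane i, while GatherOffset interprets the second vector
// as byte offsets. `base` and `idx` are hypothetical.
//   const Simd<float, 4, 0> d;
//   const Rebind<int32_t, decltype(d)> di;
//   const auto v = GatherIndex(d, base, Load(di, idx));  // v[i] = base[idx[i]]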
3534
3535HWY_DIAGNOSTICS(pop)
3536
3537// ================================================== SWIZZLE (2)
3538
3539// ------------------------------ LowerHalf
3540
3541// Returns upper/lower half of a vector.
3542template <typename T, size_t N>
3543HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
3544 Vec128<T, N> v) {
3545 return Vec128<T, N / 2>{v.raw};
3546}
3547
3548template <typename T, size_t N>
3549HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
3550 return LowerHalf(Simd<T, N / 2, 0>(), v);
3551}
3552
3553// ------------------------------ ShiftLeftBytes
3554
3555template <int kBytes, typename T, size_t N>
3556HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3557 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3558 return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3559}
3560
3561template <int kBytes, typename T, size_t N>
3562HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3563 return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
3564}
3565
3566// ------------------------------ ShiftLeftLanes
3567
3568template <int kLanes, typename T, size_t N>
3569HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3570 const Repartition<uint8_t, decltype(d)> d8;
3571 return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3572}
3573
3574template <int kLanes, typename T, size_t N>
3575HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3576 return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
3577}
3578
3579// ------------------------------ ShiftRightBytes
3580template <int kBytes, typename T, size_t N>
3581HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3582 static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3583 // For partial vectors, clear upper lanes so we shift in zeros.
3584 if (N != 16 / sizeof(T)) {
3585 const Vec128<T> vfull{v.raw};
3586 v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3587 }
3588 return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3589}
3590
3591// ------------------------------ ShiftRightLanes
3592template <int kLanes, typename T, size_t N>
3593HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3594 const Repartition<uint8_t, decltype(d)> d8;
3595 return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3596}
3597
3598// ------------------------------ UpperHalf (ShiftRightBytes)
3599
3600// Full input: copy hi into lo (smaller instruction encoding than shifts).
3601template <typename T>
3602HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, Vec128<T> v) {
3603 return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
3604}
3605HWY_API Vec128<float, 2> UpperHalf(Full64<float> /* tag */, Vec128<float> v) {
3606 return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3607}
3608HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */, Vec128<double> v) {
3609 return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
3610}
3611
3612// Partial
3613template <typename T, size_t N, HWY_IF_LE64(T, N)>
3614HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3615 Vec128<T, N> v) {
3616 const DFromV<decltype(v)> d;
3617 const RebindToUnsigned<decltype(d)> du;
3618 const auto vu = BitCast(du, v);
3619 const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3620 return Vec128<T, (N + 1) / 2>{upper.raw};
3621}
3622
3623// ------------------------------ ExtractLane (UpperHalf)
3624
3625namespace detail {
3626
3627template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3628HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3629 static_assert(kLane < N, "Lane index out of bounds");
3630#if HWY_TARGET == HWY_SSSE3
3631 const int pair = _mm_extract_epi16(v.raw, kLane / 2);
3632 constexpr int kShift = kLane & 1 ? 8 : 0;
3633 return static_cast<T>((pair >> kShift) & 0xFF);
3634#else
3635 return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
3636#endif
3637}
3638
3639template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3640HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3641 static_assert(kLane < N, "Lane index out of bounds");
3642 return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF);
3643}
3644
3645template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3646HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3647 static_assert(kLane < N, "Lane index out of bounds");
3648#if HWY_TARGET == HWY_SSSE3
3649 alignas(16) T lanes[4];
3650 Store(v, DFromV<decltype(v)>(), lanes);
3651 return lanes[kLane];
3652#else
3653 return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
3654#endif
3655}
3656
3657template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3658HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
3659 static_assert(kLane < N, "Lane index out of bounds");
3660#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3661 alignas(16) T lanes[2];
3662 Store(v, DFromV<decltype(v)>(), lanes);
3663 return lanes[kLane];
3664#else
3665 return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
3666#endif
3667}
3668
3669template <size_t kLane, size_t N>
3670HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
3671 static_assert(kLane < N, "Lane index out of bounds");
3672#if HWY_TARGET == HWY_SSSE3
3673 alignas(16) float lanes[4];
3674 Store(v, DFromV<decltype(v)>(), lanes);
3675 return lanes[kLane];
3676#else
3677 // The intrinsic is mis-declared: it returns int, but the value is the
3678 // lane's bit pattern, hence the bit copy below.
3678 const int32_t bits = _mm_extract_ps(v.raw, kLane);
3679 float ret;
3680 CopySameSize(&bits, &ret);
3681 return ret;
3682#endif
3683}
3684
3685// There is no extract_pd; two overloads because there is no UpperHalf for N=1.
3686template <size_t kLane>
3687HWY_INLINE double ExtractLane(const Vec128<double, 1> v) {
3688 static_assert(kLane == 0, "Lane index out of bounds");
3689 return GetLane(v);
3690}
3691
3692template <size_t kLane>
3693HWY_INLINE double ExtractLane(const Vec128<double> v) {
3694 static_assert(kLane < 2, "Lane index out of bounds");
3695 const Half<DFromV<decltype(v)>> dh;
3696 return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
3697}
3698
3699} // namespace detail
3700
3701// Requires one overload per vector length because ExtractLane<3> may be a
3702// compile error if it calls _mm_extract_epi64.
3703template <typename T>
3704HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
3705 HWY_DASSERT(i == 0);
3706 (void)i;
3707 return GetLane(v);
3708}
3709
3710template <typename T>
3711HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
3712#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3713 if (__builtin_constant_p(i)) {
3714 switch (i) {
3715 case 0:
3716 return detail::ExtractLane<0>(v);
3717 case 1:
3718 return detail::ExtractLane<1>(v);
3719 }
3720 }
3721#endif
3722 alignas(16) T lanes[2];
3723 Store(v, DFromV<decltype(v)>(), lanes);
3724 return lanes[i];
3725}
3726
3727template <typename T>
3728HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
3729#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3730 if (__builtin_constant_p(i)) {
3731 switch (i) {
3732 case 0:
3733 return detail::ExtractLane<0>(v);
3734 case 1:
3735 return detail::ExtractLane<1>(v);
3736 case 2:
3737 return detail::ExtractLane<2>(v);
3738 case 3:
3739 return detail::ExtractLane<3>(v);
3740 }
3741 }
3742#endif
3743 alignas(16) T lanes[4];
3744 Store(v, DFromV<decltype(v)>(), lanes);
3745 return lanes[i];
3746}
3747
3748template <typename T>
3749HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
3750#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3751 if (__builtin_constant_p(i)) {
3752 switch (i) {
3753 case 0:
3754 return detail::ExtractLane<0>(v);
3755 case 1:
3756 return detail::ExtractLane<1>(v);
3757 case 2:
3758 return detail::ExtractLane<2>(v);
3759 case 3:
3760 return detail::ExtractLane<3>(v);
3761 case 4:
3762 return detail::ExtractLane<4>(v);
3763 case 5:
3764 return detail::ExtractLane<5>(v);
3765 case 6:
3766 return detail::ExtractLane<6>(v);
3767 case 7:
3768 return detail::ExtractLane<7>(v);
3769 }
3770 }
3771#endif
3772 alignas(16) T lanes[8];
3773 Store(v, DFromV<decltype(v)>(), lanes);
3774 return lanes[i];
3775}
3776
3777template <typename T>
3778HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
3779#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3780 if (__builtin_constant_p(i)) {
3781 switch (i) {
3782 case 0:
3783 return detail::ExtractLane<0>(v);
3784 case 1:
3785 return detail::ExtractLane<1>(v);
3786 case 2:
3787 return detail::ExtractLane<2>(v);
3788 case 3:
3789 return detail::ExtractLane<3>(v);
3790 case 4:
3791 return detail::ExtractLane<4>(v);
3792 case 5:
3793 return detail::ExtractLane<5>(v);
3794 case 6:
3795 return detail::ExtractLane<6>(v);
3796 case 7:
3797 return detail::ExtractLane<7>(v);
3798 case 8:
3799 return detail::ExtractLane<8>(v);
3800 case 9:
3801 return detail::ExtractLane<9>(v);
3802 case 10:
3803 return detail::ExtractLane<10>(v);
3804 case 11:
3805 return detail::ExtractLane<11>(v);
3806 case 12:
3807 return detail::ExtractLane<12>(v);
3808 case 13:
3809 return detail::ExtractLane<13>(v);
3810 case 14:
3811 return detail::ExtractLane<14>(v);
3812 case 15:
3813 return detail::ExtractLane<15>(v);
3814 }
3815 }
3816#endif
3817 alignas(16) T lanes[16];
3818 Store(v, DFromV<decltype(v)>(), lanes);
3819 return lanes[i];
3820}
3821
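// Usage sketch (illustrative, not part of this header): ExtractLane also
// accepts a runtime index, falling back to a store plus scalar load when the
// index is not a compile-time constant. Assumes the usual per-target alias
// `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<int32_t> d;
//   const auto v = hn::Iota(d, 0);                 // {0, 1, 2, 3}
//   const int32_t third = hn::ExtractLane(v, 2);   // 2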
3822// ------------------------------ InsertLane (UpperHalf)
3823
3824namespace detail {
3825
3826template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
3827HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3828 static_assert(kLane < N, "Lane index out of bounds");
3829#if HWY_TARGET == HWY_SSSE3
3830 const DFromV<decltype(v)> d;
3831 alignas(16) T lanes[16];
3832 Store(v, d, lanes);
3833 lanes[kLane] = t;
3834 return Load(d, lanes);
3835#else
3836 return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
3837#endif
3838}
3839
3840template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3841HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3842 static_assert(kLane < N, "Lane index out of bounds");
3843 return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)};
3844}
3845
3846template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3847HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3848 static_assert(kLane < N, "Lane index out of bounds");
3849#if HWY_TARGET == HWY_SSSE3
3850 alignas(16) T lanes[4];
3851 const DFromV<decltype(v)> d;
3852 Store(v, d, lanes);
3853 lanes[kLane] = t;
3854 return Load(d, lanes);
3855#else
3856 MakeSigned<T> ti;
3857 CopySameSize(&t, &ti); // don't just cast because T might be float.
3858 return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
3859#endif
3860}
3861
3862template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3863HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
3864 static_assert(kLane < N, "Lane index out of bounds");
3865#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
3866 const DFromV<decltype(v)> d;
3867 alignas(16) T lanes[2];
3868 Store(v, d, lanes);
3869 lanes[kLane] = t;
3870 return Load(d, lanes);
3871#else
3872 MakeSigned<T> ti;
3873 CopySameSize(&t, &ti); // don't just cast because T might be float.
3874 return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
3875#endif
3876}
3877
3878template <size_t kLane, size_t N>
3879HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
3880 static_assert(kLane < N, "Lane index out of bounds");
3881#if HWY_TARGET == HWY_SSSE3
3882 const DFromV<decltype(v)> d;
3883 alignas(16) float lanes[4];
3884 Store(v, d, lanes);
3885 lanes[kLane] = t;
3886 return Load(d, lanes);
3887#else
3888 return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
3889#endif
3890}
3891
3892// There is no insert_pd; two overloads because there is no UpperHalf for N=1.
3893template <size_t kLane>
3894HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) {
3895 static_assert(kLane == 0, "Lane index out of bounds");
3896 return Set(DFromV<decltype(v)>(), t);
3897}
3898
3899template <size_t kLane>
3900HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
3901 static_assert(kLane < 2, "Lane index out of bounds");
3902 const DFromV<decltype(v)> d;
3903 const Vec128<double> vt = Set(d, t);
3904 if (kLane == 0) {
3905 return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
3906 }
3907 return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
3908}
3909
3910} // namespace detail
3911
3912// Requires one overload per vector length because InsertLane<3> may be a
3913// compile error if it calls _mm_insert_epi64.
3914
3915template <typename T>
3916HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
3917 HWY_DASSERT(i == 0);
3918 (void)i;
3919 return Set(DFromV<decltype(v)>(), t);
3920}
3921
3922template <typename T>
3923HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
3924#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3925 if (__builtin_constant_p(i)) {
3926 switch (i) {
3927 case 0:
3928 return detail::InsertLane<0>(v, t);
3929 case 1:
3930 return detail::InsertLane<1>(v, t);
3931 }
3932 }
3933#endif
3934 const DFromV<decltype(v)> d;
3935 alignas(16) T lanes[2];
3936 Store(v, d, lanes);
3937 lanes[i] = t;
3938 return Load(d, lanes);
3939}
3940
3941template <typename T>
3942HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
3943#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3944 if (__builtin_constant_p(i)) {
3945 switch (i) {
3946 case 0:
3947 return detail::InsertLane<0>(v, t);
3948 case 1:
3949 return detail::InsertLane<1>(v, t);
3950 case 2:
3951 return detail::InsertLane<2>(v, t);
3952 case 3:
3953 return detail::InsertLane<3>(v, t);
3954 }
3955 }
3956#endif
3957 const DFromV<decltype(v)> d;
3958 alignas(16) T lanes[4];
3959 Store(v, d, lanes);
3960 lanes[i] = t;
3961 return Load(d, lanes);
3962}
3963
3964template <typename T>
3965HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
3966#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3967 if (__builtin_constant_p(i)) {
3968 switch (i) {
3969 case 0:
3970 return detail::InsertLane<0>(v, t);
3971 case 1:
3972 return detail::InsertLane<1>(v, t);
3973 case 2:
3974 return detail::InsertLane<2>(v, t);
3975 case 3:
3976 return detail::InsertLane<3>(v, t);
3977 case 4:
3978 return detail::InsertLane<4>(v, t);
3979 case 5:
3980 return detail::InsertLane<5>(v, t);
3981 case 6:
3982 return detail::InsertLane<6>(v, t);
3983 case 7:
3984 return detail::InsertLane<7>(v, t);
3985 }
3986 }
3987#endif
3988 const DFromV<decltype(v)> d;
3989 alignas(16) T lanes[8];
3990 Store(v, d, lanes);
3991 lanes[i] = t;
3992 return Load(d, lanes);
3993}
3994
3995template <typename T>
3996HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
3997#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang
3998 if (__builtin_constant_p(i)) {
3999 switch (i) {
4000 case 0:
4001 return detail::InsertLane<0>(v, t);
4002 case 1:
4003 return detail::InsertLane<1>(v, t);
4004 case 2:
4005 return detail::InsertLane<2>(v, t);
4006 case 3:
4007 return detail::InsertLane<3>(v, t);
4008 case 4:
4009 return detail::InsertLane<4>(v, t);
4010 case 5:
4011 return detail::InsertLane<5>(v, t);
4012 case 6:
4013 return detail::InsertLane<6>(v, t);
4014 case 7:
4015 return detail::InsertLane<7>(v, t);
4016 case 8:
4017 return detail::InsertLane<8>(v, t);
4018 case 9:
4019 return detail::InsertLane<9>(v, t);
4020 case 10:
4021 return detail::InsertLane<10>(v, t);
4022 case 11:
4023 return detail::InsertLane<11>(v, t);
4024 case 12:
4025 return detail::InsertLane<12>(v, t);
4026 case 13:
4027 return detail::InsertLane<13>(v, t);
4028 case 14:
4029 return detail::InsertLane<14>(v, t);
4030 case 15:
4031 return detail::InsertLane<15>(v, t);
4032 }
4033 }
4034#endif
4035 const DFromV<decltype(v)> d;
4036 alignas(16) T lanes[16];
4037 Store(v, d, lanes);
4038 lanes[i] = t;
4039 return Load(d, lanes);
4040}
4041
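// Usage sketch (illustrative, not part of this header): InsertLane overwrites
// one lane chosen at runtime and leaves the others unchanged. Assumes
// `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<float> d;
//   auto v = hn::Zero(d);                // {0, 0, 0, 0}
//   v = hn::InsertLane(v, 1, 42.0f);     // {0, 42, 0, 0}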
4042// ------------------------------ CombineShiftRightBytes
4043
4044template <int kBytes, typename T, class V = Vec128<T>>
4045HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
4046 const Repartition<uint8_t, decltype(d)> d8;
4047 return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
4048 BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
4049}
4050
4051template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
4052 class V = Vec128<T, N>>
4053HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
4054 constexpr size_t kSize = N * sizeof(T);
4055 static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
4056 const Repartition<uint8_t, decltype(d)> d8;
4057 const Full128<uint8_t> d_full8;
4058 using V8 = VFromD<decltype(d_full8)>;
4059 const V8 hi8{BitCast(d8, hi).raw};
4060 // Move into most-significant bytes
4061 const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
4062 const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
4063 return V{BitCast(Full128<T>(), r).raw};
4064}
4065
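// Usage sketch (illustrative, not part of this header): CombineShiftRightBytes
// returns a 16-byte window that starts kBytes into `lo` and continues into
// `hi`, e.g. to form an unaligned view from two aligned loads. Assumes
// `namespace hn = hwy::HWY_NAMESPACE;`; `aligned` is a hypothetical pointer to
// at least 32 readable, 16-byte-aligned uint8_t.
//   const hn::Full128<uint8_t> d;
//   const auto lo = hn::Load(d, aligned);        // bytes 0..15
//   const auto hi = hn::Load(d, aligned + 16);   // bytes 16..31
//   const auto win = hn::CombineShiftRightBytes<3>(d, hi, lo);  // bytes 3..18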
4066// ------------------------------ Broadcast/splat any lane
4067
4068// Unsigned
4069template <int kLane, size_t N>
4070HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
4071 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4072 if (kLane < 4) {
4073 const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
4074 return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
4075 } else {
4076 const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
4077 return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
4078 }
4079}
4080template <int kLane, size_t N>
4081HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
4082 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4083 return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
4084}
4085template <int kLane, size_t N>
4086HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
4087 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4088 return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4089}
4090
4091// Signed
4092template <int kLane, size_t N>
4093HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
4094 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4095 if (kLane < 4) {
4096 const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
4097 return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
4098 } else {
4099 const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
4100 return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
4101 }
4102}
4103template <int kLane, size_t N>
4104HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
4105 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4106 return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
4107}
4108template <int kLane, size_t N>
4109HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
4110 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4111 return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
4112}
4113
4114// Float
4115template <int kLane, size_t N>
4116HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
4117 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4118 return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
4119}
4120template <int kLane, size_t N>
4121HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
4122 static_assert(0 <= kLane && kLane < N, "Invalid lane");
4123 return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
4124}
4125
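// Usage sketch (illustrative, not part of this header): Broadcast splats one
// existing lane to all lanes. Assumes `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<float> d;
//   const auto v = hn::Iota(d, 1.0f);        // {1, 2, 3, 4}
//   const auto all2 = hn::Broadcast<1>(v);   // {2, 2, 2, 2}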
4126// ------------------------------ TableLookupLanes (Shuffle01)
4127
4128// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
4129template <typename T, size_t N = 16 / sizeof(T)>
4130struct Indices128 {
4131 __m128i raw;
4132};
4133
4134template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
4135 HWY_IF_LANE_SIZE(T, 4)>
4136HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
4137 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4138#if HWY_IS_DEBUG_BUILD
4139 const Rebind<TI, decltype(d)> di;
4140 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4141 AllTrue(di, Lt(vec, Set(di, N))));
4142#endif
4143
4144#if HWY_TARGET <= HWY_AVX2
4145 (void)d;
4146 return Indices128<T, N>{vec.raw};
4147#else
4148 const Repartition<uint8_t, decltype(d)> d8;
4149 using V8 = VFromD<decltype(d8)>;
4150 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
4151 0, 1, 2, 3, 0, 1, 2, 3};
4152
4153 // Broadcast each lane index to all 4 bytes of T
4154 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
4155 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
4156 const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
4157
4158 // Shift to bytes
4159 const Repartition<uint16_t, decltype(d)> d16;
4160 const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
4161
4162 return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
4163#endif
4164}
4165
4166template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
4167 HWY_IF_LANE_SIZE(T, 8)>
4168HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
4169 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
4170#if HWY_IS_DEBUG_BUILD
4171 const Rebind<TI, decltype(d)> di;
4172 HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
4173 AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
4174#else
4175 (void)d;
4176#endif
4177
4178 // No change - even without AVX3, we can shuffle+blend.
4179 return Indices128<T, N>{vec.raw};
4180}
4181
4182template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
4183HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
4184 const Rebind<TI, decltype(d)> di;
4185 return IndicesFromVec(d, LoadU(di, idx));
4186}
4187
4188template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4189HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
4190#if HWY_TARGET <= HWY_AVX2
4191 const DFromV<decltype(v)> d;
4192 const RebindToFloat<decltype(d)> df;
4193 const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
4194 return BitCast(d, perm);
4195#else
4196 return TableLookupBytes(v, Vec128<T, N>{idx.raw});
4197#endif
4198}
4199
4200template <size_t N, HWY_IF_GE64(float, N)>
4201HWY_API Vec128<float, N> TableLookupLanes(const Vec128<float, N> v,
4202 const Indices128<float, N> idx) {
4203#if HWY_TARGET <= HWY_AVX2
4204 return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
4205#else
4206 const DFromV<decltype(v)> df;
4207 const RebindToSigned<decltype(df)> di;
4208 return BitCast(df,
4209 TableLookupLanes(BitCast(di, v), Indices128<int32_t, N>{idx.raw}));
4210#endif
4211}
4212
4213// Single lane: no change
4214template <typename T>
4215HWY_API Vec128<T, 1> TableLookupLanes(const Vec128<T, 1> v,
4216 const Indices128<T, 1> /* idx */) {
4217 return v;
4218}
4219
4220template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4221HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
4222 const Full128<T> d;
4223 Vec128<int64_t> vidx{idx.raw};
4224#if HWY_TARGET <= HWY_AVX2
4225 // There is no _mm_permute[x]var_epi64.
4226 vidx += vidx; // _mm_permutevar_pd selects by bit 1, not bit 0 (unusual)
4227 const Full128<double> df;
4228 return BitCast(
4229 d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
4230#else
4231 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
4232 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
4233 // to obtain an all-zero or all-one mask.
4234 const Full128<int64_t> di;
4235 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
4236 const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
4237 return IfThenElse(mask_same, v, Shuffle01(v));
4238#endif
4239}
4240
4241HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
4242 Indices128<double> idx) {
4243 Vec128<int64_t> vidx{idx.raw};
4244#if HWY_TARGET <= HWY_AVX2
4245 vidx += vidx; // _mm_permutevar_pd selects by bit 1, not bit 0 (unusual)
4246 return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
4247#else
4248 // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
4249 // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
4250 // to obtain an all-zero or all-one mask.
4251 const Full128<double> d;
4252 const Full128<int64_t> di;
4253 const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
4254 const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
4255 return IfThenElse(mask_same, v, Shuffle01(v));
4256#endif
4257}
4258
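// Usage sketch (illustrative, not part of this header): permuting lanes with a
// runtime-specified index vector; every index must be in [0, N). Assumes
// `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<uint32_t> d;
//   const int32_t idx_lanes[4] = {3, 2, 1, 0};
//   const auto idx = hn::SetTableIndices(d, idx_lanes);
//   const auto v = hn::Iota(d, 0);                  // {0, 1, 2, 3}
//   const auto rev = hn::TableLookupLanes(v, idx);  // {3, 2, 1, 0}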
4259// ------------------------------ ReverseBlocks
4260
4261// Single block: no change
4262template <typename T>
4263HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
4264 return v;
4265}
4266
4267// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
4268
4269// Single lane: no change
4270template <typename T>
4271HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
4272 return v;
4273}
4274
4275// Two lanes: shuffle
4276template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4277HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
4278 return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
4279}
4280
4281template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4282HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4283 return Shuffle01(v);
4284}
4285
4286// Four lanes: shuffle
4287template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4288HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
4289 return Shuffle0123(v);
4290}
4291
4292// 16-bit
4293template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4294HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
4295#if HWY_TARGET <= HWY_AVX3
4296 if (N == 1) return v;
4297 if (N == 2) {
4298 const Repartition<uint32_t, decltype(d)> du32;
4299 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
4300 }
4301 const RebindToSigned<decltype(d)> di;
4302 alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
4303 const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
4304 return BitCast(d, Vec128<int16_t, N>{
4305 _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4306#else
4307 const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
4308 return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
4309#endif
4310}
4311
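// Usage sketch (illustrative, not part of this header): Reverse flips the
// order of all lanes. Assumes `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<uint16_t> d;
//   const auto v = hn::Iota(d, 0);      // {0, 1, ..., 7}
//   const auto r = hn::Reverse(d, v);   // {7, 6, ..., 0}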
4312// ------------------------------ Reverse2
4313
4314// Single lane: no change
4315template <typename T>
4316HWY_API Vec128<T, 1> Reverse2(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
4317 return v;
4318}
4319
4320template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4321HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
4322 alignas(16) const T kShuffle[16] = {1, 0, 3, 2, 5, 4, 7, 6,
4323 9, 8, 11, 10, 13, 12, 15, 14};
4324 return TableLookupBytes(v, Load(d, kShuffle));
4325}
4326
4327template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4328HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
4329 const Repartition<uint32_t, decltype(d)> du32;
4330 return BitCast(d, RotateRight<16>(BitCast(du32, v)));
4331}
4332
4333template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4334HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4335 return Shuffle2301(v);
4336}
4337
4338template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4339HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
4340 return Shuffle01(v);
4341}
4342
4343// ------------------------------ Reverse4
4344
4345template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4346HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
4347 const RebindToSigned<decltype(d)> di;
4348 // 4x 16-bit: a single shufflelo suffices.
4349 if (N == 4) {
4350 return BitCast(d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
4351 BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
4352 }
4353
4354#if HWY_TARGET <= HWY_AVX3
4355 alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
4356 const Vec128<int16_t, N> idx = Load(di, kReverse4);
4357 return BitCast(d, Vec128<int16_t, N>{
4358 _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4359#else
4360 const RepartitionToWide<decltype(di)> dw;
4361 return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
4362#endif
4363}
4364
4365// 4x 32-bit: use Shuffle0123
4366template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4367HWY_API Vec128<T> Reverse4(Full128<T> /* tag */, const Vec128<T> v) {
4368 return Shuffle0123(v);
4369}
4370
4371template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4372HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
4373 HWY_ASSERT(0); // don't have 4 u64 lanes
4374}
4375
4376// ------------------------------ Reverse8
4377
4378template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4379HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
4380#if HWY_TARGET <= HWY_AVX3
4381 const RebindToSigned<decltype(d)> di;
4382 alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
4383 15, 14, 13, 12, 11, 10, 9, 8};
4384 const Vec128<int16_t, N> idx = Load(di, kReverse8);
4385 return BitCast(d, Vec128<int16_t, N>{
4386 _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
4387#else
4388 const RepartitionToWide<decltype(d)> dw;
4389 return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
4390#endif
4391}
4392
4393template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
4394HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
4395 HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
4396}
4397
4398// ------------------------------ InterleaveLower
4399
4400// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
4401// the least-significant lane) and "b". To concatenate two half-width integers
4402// into one, use ZipLower/Upper instead (also works with scalar).
4403
4404template <size_t N, HWY_IF_LE128(uint8_t, N)>
4405HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
4406 const Vec128<uint8_t, N> b) {
4407 return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
4408}
4409template <size_t N, HWY_IF_LE128(uint16_t, N)>
4410HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
4411 const Vec128<uint16_t, N> b) {
4412 return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
4413}
4414template <size_t N, HWY_IF_LE128(uint32_t, N)>
4415HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
4416 const Vec128<uint32_t, N> b) {
4417 return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
4418}
4419template <size_t N, HWY_IF_LE128(uint64_t, N)>
4420HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
4421 const Vec128<uint64_t, N> b) {
4422 return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
4423}
4424
4425template <size_t N, HWY_IF_LE128(int8_t, N)>
4426HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
4427 const Vec128<int8_t, N> b) {
4428 return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
4429}
4430template <size_t N, HWY_IF_LE128(int16_t, N)>
4431HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
4432 const Vec128<int16_t, N> b) {
4433 return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
4434}
4435template <size_t N, HWY_IF_LE128(int32_t, N)>
4436HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
4437 const Vec128<int32_t, N> b) {
4438 return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
4439}
4440template <size_t N, HWY_IF_LE128(int64_t, N)>
4441HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
4442 const Vec128<int64_t, N> b) {
4443 return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
4444}
4445
4446template <size_t N, HWY_IF_LE128(float, N)>
4447HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
4448 const Vec128<float, N> b) {
4449 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
4450}
4451template <size_t N, HWY_IF_LE128(double, N)>
4452HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
4453 const Vec128<double, N> b) {
4454 return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
4455}
4456
4457// Additional overload for the optional tag (also for 256/512).
4458template <class V>
4459HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
4460 return InterleaveLower(a, b);
4461}
4462
4463// ------------------------------ InterleaveUpper (UpperHalf)
4464
4465// All functions inside detail lack the required D parameter.
4466namespace detail {
4467
4468HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
4469 const Vec128<uint8_t> b) {
4470 return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
4471}
4472HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
4473 const Vec128<uint16_t> b) {
4474 return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
4475}
4476HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
4477 const Vec128<uint32_t> b) {
4478 return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
4479}
4480HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
4481 const Vec128<uint64_t> b) {
4482 return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
4483}
4484
4485HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
4486 const Vec128<int8_t> b) {
4487 return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
4488}
4489HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
4490 const Vec128<int16_t> b) {
4491 return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
4492}
4493HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
4494 const Vec128<int32_t> b) {
4495 return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
4496}
4497HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
4498 const Vec128<int64_t> b) {
4499 return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
4500}
4501
4502HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
4503 const Vec128<float> b) {
4504 return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
4505}
4506HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
4507 const Vec128<double> b) {
4508 return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
4509}
4510
4511} // namespace detail
4512
4513// Full
4514template <typename T, class V = Vec128<T>>
4515HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
4516 return detail::InterleaveUpper(a, b);
4517}
4518
4519// Partial
4520template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
4521HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
4522 const Half<decltype(d)> d2;
4523 return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
4524}
4525
4526// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
4527
4528// Same as Interleave*, except that the return lanes are double-width integers;
4529// this is necessary because the single-lane scalar cannot return two values.
4530template <class V, class DW = RepartitionToWide<DFromV<V>>>
4531HWY_API VFromD<DW> ZipLower(V a, V b) {
4532 return BitCast(DW(), InterleaveLower(a, b));
4533}
4534template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4535HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4536 return BitCast(dw, InterleaveLower(D(), a, b));
4537}
4538
4539template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
4540HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4541 return BitCast(dw, InterleaveUpper(D(), a, b));
4542}
4543
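// Usage sketch (illustrative, not part of this header): ZipLower/ZipUpper view
// interleaved pairs of narrow lanes as one wider lane; with a zero second
// operand this zero-extends the lower/upper half of `a`. Assumes
// `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<uint8_t> d8;
//   const hn::RepartitionToWide<decltype(d8)> d16;
//   const auto a = hn::Iota(d8, 0);
//   const auto zero = hn::Zero(d8);
//   const auto lo16 = hn::ZipLower(d16, a, zero);  // bytes 0..7 of a, widened
//   const auto hi16 = hn::ZipUpper(d16, a, zero);  // bytes 8..15 of a, widened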
4544// ================================================== COMBINE
4545
4546// ------------------------------ Combine (InterleaveLower)
4547
4548// N = N/2 + N/2 (upper half undefined)
4549template <typename T, size_t N, HWY_IF_LE128(T, N)>
4550HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
4551 Vec128<T, N / 2> lo_half) {
4552 const Half<decltype(d)> d2;
4553 const RebindToUnsigned<decltype(d2)> du2;
4554 // Treat half-width input as one lane, and expand to two lanes.
4555 using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
4556 const VU lo{BitCast(du2, lo_half).raw};
4557 const VU hi{BitCast(du2, hi_half).raw};
4558 return BitCast(d, InterleaveLower(lo, hi));
4559}
4560
4561// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
4562
4563// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
4564namespace detail {
4565
4566template <typename T>
4567HWY_INLINE Vec128<T> ZeroExtendVector(hwy::NonFloatTag /* tag */,
4568 Full128<T> /* d */, Vec64<T> lo) {
4569 return Vec128<T>{_mm_move_epi64(lo.raw)};
4570}
4571
4572template <typename T>
4573HWY_INLINE Vec128<T> ZeroExtendVector(hwy::FloatTag /* tag */, Full128<T> d,
4574 Vec64<T> lo) {
4575 const RebindToUnsigned<decltype(d)> du;
4576 return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
4577}
4578
4579} // namespace detail
4580
4581template <typename T>
4582HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
4583 return detail::ZeroExtendVector(hwy::IsFloatTag<T>(), d, lo);
4584}
4585
4586template <typename T, size_t N, HWY_IF_LE64(T, N)>
4587HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
4588 return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
4589}
4590
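// Usage sketch (illustrative, not part of this header): Combine joins two
// half-width vectors (lo into the lower half); ZeroExtendVector is the special
// case of a zero upper half. Assumes `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<uint32_t> d;
//   const hn::Half<decltype(d)> dh;                 // 64-bit, 2 lanes
//   const auto lo = hn::Iota(dh, 1);                // {1, 2}
//   const auto hi = hn::Iota(dh, 3);                // {3, 4}
//   const auto both = hn::Combine(d, hi, lo);       // {1, 2, 3, 4}
//   const auto ext = hn::ZeroExtendVector(d, lo);   // {1, 2, 0, 0}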
4591// ------------------------------ Concat full (InterleaveLower)
4592
4593// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
4594template <typename T>
4595HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4596 const Repartition<uint64_t, decltype(d)> d64;
4597 return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
4598}
4599
4600// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
4601template <typename T>
4602HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4603 const Repartition<uint64_t, decltype(d)> d64;
4604 return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
4605}
4606
4607// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
4608template <typename T>
4609HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
4610 const Vec128<T> lo) {
4611 return CombineShiftRightBytes<8>(d, hi, lo);
4612}
4613
4614// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4615template <typename T>
4616HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4617 const Repartition<double, decltype(d)> dd;
4618#if HWY_TARGET == HWY_SSSE3
4619 return BitCast(
4620 d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
4621 _MM_SHUFFLE2(1, 0))});
4622#else
4623 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
4624 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
4625 BitCast(dd, lo).raw, 1)});
4626#endif
4627}
4628HWY_API Vec128<float> ConcatUpperLower(Full128<float> d, Vec128<float> hi,
4629 Vec128<float> lo) {
4630#if HWY_TARGET == HWY_SSSE3
4631 (void)d;
4632 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
4633#else
4634 // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4635 const RepartitionToWide<decltype(d)> dd;
4636 return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
4637 BitCast(dd, lo).raw, 1)});
4638#endif
4639}
4640HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
4641 Vec128<double> hi, Vec128<double> lo) {
4642#if HWY_TARGET == HWY_SSSE3
4643 return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
4644#else
4645 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4646 return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
4647#endif
4648}
4649
4650// ------------------------------ Concat partial (Combine, LowerHalf)
4651
4652template <typename T, size_t N, HWY_IF_LE64(T, N)>
4653HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4654 Vec128<T, N> lo) {
4655 const Half<decltype(d)> d2;
4656 return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
4657}
4658
4659template <typename T, size_t N, HWY_IF_LE64(T, N)>
4660HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, Vec128<T, N> hi,
4661 Vec128<T, N> lo) {
4662 const Half<decltype(d)> d2;
4663 return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
4664}
4665
4666template <typename T, size_t N, HWY_IF_LE64(T, N)>
4667HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
4668 const Vec128<T, N> lo) {
4669 const Half<decltype(d)> d2;
4670 return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
4671}
4672
4673template <typename T, size_t N, HWY_IF_LE64(T, N)>
4674HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4675 Vec128<T, N> lo) {
4676 const Half<decltype(d)> d2;
4677 return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
4678}
4679
4680// ------------------------------ ConcatOdd
4681
4682// 8-bit full
4683template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4684HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4685 const Repartition<uint16_t, decltype(d)> dw;
4686 // Right-shift 8 bits per u16 so we can pack.
4687 const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
4688 const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
4689 return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4690}
4691
4692// 8-bit x8
4693template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4694HWY_API Vec64<T> ConcatOdd(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4695 const Repartition<uint32_t, decltype(d)> du32;
4696 // Don't care about upper half, no need to zero.
4697 alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
4698 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
4699 const Vec64<T> L = TableLookupBytes(lo, shuf);
4700 const Vec64<T> H = TableLookupBytes(hi, shuf);
4701 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4702}
4703
4704// 8-bit x4
4705template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4706HWY_API Vec32<T> ConcatOdd(Full32<T> d, Vec32<T> hi, Vec32<T> lo) {
4707 const Repartition<uint16_t, decltype(d)> du16;
4708 // Don't care about upper half, no need to zero.
4709 alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
4710 const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
4711 const Vec32<T> L = TableLookupBytes(lo, shuf);
4712 const Vec32<T> H = TableLookupBytes(hi, shuf);
4713 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
4714}
4715
4716// 16-bit full
4717template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4718HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4719 // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
4720 // 0xFFFF8000, which correctly saturates to 0x8000.
4721 const Repartition<int32_t, decltype(d)> dw;
4722 const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
4723 const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
4724 return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
4725}
4726
4727// 16-bit x4
4728template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4729HWY_API Vec64<T> ConcatOdd(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4730 const Repartition<uint32_t, decltype(d)> du32;
4731 // Don't care about upper half, no need to zero.
4732 alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
4733 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
4734 const Vec64<T> L = TableLookupBytes(lo, shuf);
4735 const Vec64<T> H = TableLookupBytes(hi, shuf);
4736 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4737}
4738
4739// 32-bit full
4740template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4741HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4742 const RebindToFloat<decltype(d)> df;
4743 return BitCast(
4744 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4745 _MM_SHUFFLE(3, 1, 3, 1))});
4746}
4747template <size_t N>
4749 Vec128<float> lo) {
4750 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
4751}
4752
4753// Any type x2
4754template <typename T>
4755HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4756 Vec128<T, 2> lo) {
4757 return InterleaveUpper(d, lo, hi);
4758}
4759
4760// ------------------------------ ConcatEven (InterleaveLower)
4761
4762// 8-bit full
4763template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4764HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4765 const Repartition<uint16_t, decltype(d)> dw;
4766 // Isolate lower 8 bits per u16 so we can pack.
4767 const Vec128<uint16_t> mask = Set(dw, 0x00FF);
4768 const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
4769 const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
4770 return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
4771}
4772
4773// 8-bit x8
4774template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4775HWY_API Vec64<T> ConcatEven(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4776 const Repartition<uint32_t, decltype(d)> du32;
4777 // Don't care about upper half, no need to zero.
4778 alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
4779 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
4780 const Vec64<T> L = TableLookupBytes(lo, shuf);
4781 const Vec64<T> H = TableLookupBytes(hi, shuf);
4782 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4783}
4784
4785// 8-bit x4
4786template <typename T, HWY_IF_LANE_SIZE(T, 1)>
4787HWY_API Vec32<T> ConcatEven(Full32<T> d, Vec32<T> hi, Vec32<T> lo) {
4788 const Repartition<uint16_t, decltype(d)> du16;
4789 // Don't care about upper half, no need to zero.
4790 alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
4791 const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
4792 const Vec32<T> L = TableLookupBytes(lo, shuf);
4793 const Vec32<T> H = TableLookupBytes(hi, shuf);
4794 return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
4795}
4796
4797// 16-bit full
4798template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4799HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4800#if HWY_TARGET <= HWY_SSE4
4801 // Isolate lower 16 bits per u32 so we can pack.
4802 const Repartition<uint32_t, decltype(d)> dw;
4803 const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
4804 const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
4805 const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
4806 return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)};
4807#else
4808 // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two
4809 // inputs, then concatenate them.
4810 alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
4811 const Vec128<T> shuf = BitCast(d, Load(d, kCompactEvenU16));
4812 const Vec128<T> L = TableLookupBytes(lo, shuf);
4813 const Vec128<T> H = TableLookupBytes(hi, shuf);
4814 return ConcatLowerLower(d, H, L);
4815#endif
4816}
4817
4818// 16-bit x4
4819template <typename T, HWY_IF_LANE_SIZE(T, 2)>
4820HWY_API Vec64<T> ConcatEven(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
4821 const Repartition<uint32_t, decltype(d)> du32;
4822 // Don't care about upper half, no need to zero.
4823 alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
4824 const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
4825 const Vec64<T> L = TableLookupBytes(lo, shuf);
4826 const Vec64<T> H = TableLookupBytes(hi, shuf);
4827 return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
4828}
4829
4830// 32-bit full
4831template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4832HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4833 const RebindToFloat<decltype(d)> df;
4834 return BitCast(
4835 d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4836 _MM_SHUFFLE(2, 0, 2, 0))});
4837}
4839 Vec128<float> lo) {
4840 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
4841}
4842
4843// Any T x2
4844template <typename T>
4845HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
4846 Vec128<T, 2> lo) {
4847 return InterleaveLower(d, lo, hi);
4848}
4849
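// Usage sketch (illustrative, not part of this header): ConcatEven/ConcatOdd
// de-interleave two vectors, e.g. separating the real and imaginary parts of
// packed complex numbers. Assumes `namespace hn = hwy::HWY_NAMESPACE;`;
// `interleaved` is a hypothetical const float* holding re0,im0,re1,im1,...
//   const hn::Full128<float> d;
//   const auto lo = hn::LoadU(d, interleaved);       // {re0, im0, re1, im1}
//   const auto hi = hn::LoadU(d, interleaved + 4);   // {re2, im2, re3, im3}
//   const auto re = hn::ConcatEven(d, hi, lo);       // {re0, re1, re2, re3}
//   const auto im = hn::ConcatOdd(d, hi, lo);        // {im0, im1, im2, im3}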
4850// ------------------------------ DupEven (InterleaveLower)
4851
4852template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4853HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
4854 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4855}
4856template <size_t N>
4857HWY_API Vec128<float, N> DupEven(Vec128<float, N> v) {
4858 return Vec128<float, N>{
4859 _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4860}
4861
4862template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4863HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4864 return InterleaveLower(DFromV<decltype(v)>(), v, v);
4865}
4866
4867// ------------------------------ DupOdd (InterleaveUpper)
4868
4869template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4870HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
4871 return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4872}
4873template <size_t N>
4874HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
4875 return Vec128<float, N>{
4876 _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4877}
4878
4879template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4880HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4881 return InterleaveUpper(DFromV<decltype(v)>(), v, v);
4882}
4883
4884// ------------------------------ OddEven (IfThenElse)
4885
4886template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4887HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4888 const DFromV<decltype(a)> d;
4889 const Repartition<uint8_t, decltype(d)> d8;
4890 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4891 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4892 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4893}
4894
4895template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4896HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4897#if HWY_TARGET == HWY_SSSE3
4898 const DFromV<decltype(a)> d;
4899 const Repartition<uint8_t, decltype(d)> d8;
4900 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4901 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4902 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4903#else
4904 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4905#endif
4906}
4907
4908template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4909HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4910#if HWY_TARGET == HWY_SSSE3
4911 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4912 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4913 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4914#else
4915 // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
4916 const DFromV<decltype(a)> d;
4917 const RebindToFloat<decltype(d)> df;
4918 return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
4919 BitCast(df, b).raw, 5)});
4920#endif
4921}
4922
4923template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4924HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4925 // Same as ConcatUpperLower for full vectors; do not call that because this
4926 // is more efficient for 64x1 vectors.
4927 const DFromV<decltype(a)> d;
4928 const RebindToFloat<decltype(d)> dd;
4929#if HWY_TARGET == HWY_SSSE3
4930 return BitCast(
4931 d, Vec128<double, N>{_mm_shuffle_pd(
4932 BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
4933#else
4934 // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
4935 return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
4936 BitCast(dd, b).raw, 1)});
4937#endif
4938}
4939
4940template <size_t N>
4941HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
4942#if HWY_TARGET == HWY_SSSE3
4943 // SHUFPS must fill the lower half of the output from one input, so we
4944 // need another shuffle. Unpack avoids another immediate byte.
4945 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4946 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4947 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4948#else
4949 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
4950#endif
4951}
4952
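// Usage sketch (illustrative, not part of this header): OddEven blends two
// vectors, taking odd-indexed lanes from the first argument and even-indexed
// lanes from the second. Assumes `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<int32_t> d;
//   const auto odd_src = hn::Set(d, 1);
//   const auto even_src = hn::Set(d, 2);
//   const auto mix = hn::OddEven(odd_src, even_src);  // {2, 1, 2, 1}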
4953// ------------------------------ OddEvenBlocks
4954template <typename T, size_t N>
4955HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4956 return even;
4957}
4958
4959// ------------------------------ SwapAdjacentBlocks
4960
4961template <typename T, size_t N>
4962HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4963 return v;
4964}
4965
4966// ------------------------------ Shl (ZipLower, Mul)
4967
4968// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
4969// two from loading float exponents, which is considerably faster (according
4970// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
4971
4972namespace detail {
4973#if HWY_TARGET > HWY_AVX3 // AVX2 or older
4974
4975// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
4976template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4977HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4978 const DFromV<decltype(v)> d;
4979 const RepartitionToWide<decltype(d)> dw;
4980 const Rebind<float, decltype(dw)> df;
4981 const auto zero = Zero(d);
4982 // Move into exponent (this u16 will become the upper half of an f32)
4983 const auto exp = ShiftLeft<23 - 16>(v);
4984 const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
4985 // Insert 0 into lower halves for reinterpreting as binary32.
4986 const auto f0 = ZipLower(dw, zero, upper);
4987 const auto f1 = ZipUpper(dw, zero, upper);
4988 // See comment below.
4989 const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
4990 const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
4991 return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
4992}
4993
4994// Same, for 32-bit shifts.
4995template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4996HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4997 const DFromV<decltype(v)> d;
4998 const auto exp = ShiftLeft<23>(v);
4999 const auto f = exp + Set(d, 0x3F800000); // 1.0f
5000 // Do not use ConvertTo because we rely on the native 0x80..00 overflow
5001 // behavior. cvt instead of cvtt should be equivalent, but avoids test
5002 // failure under GCC 10.2.1.
5003 return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
5004}
5005
5006#endif // HWY_TARGET > HWY_AVX3
5007
5008template <size_t N>
5010 Vec128<uint16_t, N> bits) {
5011#if HWY_TARGET <= HWY_AVX3
5012 return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
5013#else
5014 return v * Pow2(bits);
5015#endif
5016}
5018 Vec128<uint16_t, 1> bits) {
5019 return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
5020}
5021
5022template <size_t N>
5024 Vec128<uint32_t, N> bits) {
5025#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5026 return v * Pow2(bits);
5027#else
5028 return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
5029#endif
5030}
5032 const Vec128<uint32_t, 1> bits) {
5033 return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
5034}
5035
5037 Vec128<uint64_t> bits) {
5038#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5039 // Individual shifts and combine
5040 const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
5041 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
5042 const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
5043 return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
5044#else
5045 return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
5046#endif
5047}
5049 Vec64<uint64_t> bits) {
5050 return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
5051}
5052
5053// Signed left shift is the same as unsigned.
5054template <typename T, size_t N>
5056 Vec128<T, N> bits) {
5057 const DFromV<decltype(v)> di;
5058 const RebindToUnsigned<decltype(di)> du;
5059 return BitCast(di,
5060 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
5061}
5062
5063} // namespace detail
5064
5065template <typename T, size_t N>
5066HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
5067 return detail::Shl(hwy::TypeTag<T>(), v, bits);
5068}
5069
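// Usage sketch (illustrative, not part of this header): per-lane shift counts,
// which lower to variable-shift instructions on AVX2/AVX3 and to the Pow2
// multiply trick above on SSSE3/SSE4. Assumes `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<uint32_t> d;
//   const auto v = hn::Set(d, 1u);
//   const auto counts = hn::Iota(d, 0);   // {0, 1, 2, 3}
//   const auto shifted = v << counts;     // {1, 2, 4, 8}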
5070// ------------------------------ Shr (mul, mask, BroadcastSignBit)
5071
5072// Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
5073// widening multiplication by powers of two obtained by loading float exponents,
5074// followed by a constant right-shift. This is still faster than a scalar or
5075// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
5076
5077template <size_t N>
5079 const Vec128<uint16_t, N> bits) {
5080#if HWY_TARGET <= HWY_AVX3
5081 return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
5082#else
5083 const Simd<uint16_t, N, 0> d;
5084 // For bits=0, we cannot mul by 2^16, so fix the result later.
5085 const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
5086 // Replace output with input where bits == 0.
5087 return IfThenElse(bits == Zero(d), in, out);
5088#endif
5089}
5091 const Vec128<uint16_t, 1> bits) {
5092 return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
5093}
5094
5095template <size_t N>
5097 const Vec128<uint32_t, N> bits) {
5098#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5099 // 32x32 -> 64 bit mul, then shift right by 32.
5100 const Simd<uint32_t, N, 0> d32;
5101 // Move odd lanes into position for the second mul. Shuffle more gracefully
5102 // handles N=1 than repartitioning to u64 and shifting 32 bits right.
5103 const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
5104 // For bits=0, we cannot mul by 2^32, so fix the result later.
5105 const auto mul = detail::Pow2(Set(d32, 32) - bits);
5106 const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
5107 const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
5108 // No need to shift right, already in the correct position.
5109 const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
5110 const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
5111 // Replace output with input where bits == 0.
5112 return IfThenElse(bits == Zero(d32), in, out);
5113#else
5114 return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
5115#endif
5116}
5118 const Vec128<uint32_t, 1> bits) {
5119 return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
5120}
5121
5122HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
5123 const Vec128<uint64_t> bits) {
5124#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
5125 // Individual shifts and combine
5126 const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
5127 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
5128 const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
5129 return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
5130#else
5131 return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
5132#endif
5133}
5134HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
5135 const Vec64<uint64_t> bits) {
5136 return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
5137}
5138
5139#if HWY_TARGET > HWY_AVX3 // AVX2 or older
5140namespace detail {
5141
5142// Also used in x86_256-inl.h.
5143template <class DI, class V>
5144HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
5145 const RebindToUnsigned<DI> du;
5146 const auto count = BitCast(du, count_i); // same type as value to shift
5147 // Clear sign and restore afterwards. This is preferable to shifting the MSB
5148 // downwards because Shr is somewhat more expensive than Shl.
5149 const auto sign = BroadcastSignBit(v);
5150 const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
5151 return BitCast(di, abs >> count) ^ sign;
5152}
5153
5154} // namespace detail
5155#endif // HWY_TARGET > HWY_AVX3
5156
5157template <size_t N>
5159 const Vec128<int16_t, N> bits) {
5160#if HWY_TARGET <= HWY_AVX3
5161 return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
5162#else
5163 return detail::SignedShr(Simd<int16_t, N, 0>(), v, bits);
5164#endif
5165}
5167 const Vec128<int16_t, 1> bits) {
5168 return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
5169}
5170
5171template <size_t N>
5173 const Vec128<int32_t, N> bits) {
5174#if HWY_TARGET <= HWY_AVX3
5175 return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
5176#else
5177 return detail::SignedShr(Simd<int32_t, N, 0>(), v, bits);
5178#endif
5179}
5181 const Vec128<int32_t, 1> bits) {
5182 return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
5183}
5184
5185template <size_t N>
5187 const Vec128<int64_t, N> bits) {
5188#if HWY_TARGET <= HWY_AVX3
5189 return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
5190#else
5191 return detail::SignedShr(Simd<int64_t, N, 0>(), v, bits);
5192#endif
5193}
5194
5195// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
5196
5197HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
5198 const Vec128<uint64_t> b) {
5199 alignas(16) uint64_t mul[2];
5200 mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
5201 return Load(Full128<uint64_t>(), mul);
5202}
5203
5204HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
5205 const Vec128<uint64_t> b) {
5206 alignas(16) uint64_t mul[2];
5207 const Half<Full128<uint64_t>> d2;
5208 mul[0] =
5209 Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
5210 return Load(Full128<uint64_t>(), mul);
5211}
5212
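// Usage sketch (illustrative, not part of this header): full 64x64-bit
// products. MulEven multiplies the lower lanes of a and b, MulOdd the upper
// lanes; lane 0 of the result is the low 64 bits and lane 1 the high 64 bits.
// Assumes `namespace hn = hwy::HWY_NAMESPACE;`.
//   const hn::Full128<uint64_t> d;
//   const auto a = hn::Set(d, 0x123456789ABCDEF0ull);
//   const auto b = hn::Set(d, 10ull);
//   const auto lo_hi = hn::MulEven(a, b);  // {low64(a0*b0), high64(a0*b0)}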
5213// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5214
5215template <class V, size_t N, class D16 = Simd<bfloat16_t, 2 * N, 0>>
5216HWY_API V ReorderWidenMulAccumulate(Simd<float, N, 0> df32, VFromD<D16> a,
5217 VFromD<D16> b, const V sum0, V& sum1) {
5218 // TODO(janwas): _mm_dpbf16_ps when available
5219 const RebindToUnsigned<decltype(df32)> du32;
5220 // Lane order within sum0/1 is undefined, hence we can avoid the
5221 // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip
5222 // leads to the odd/even order that RearrangeToOddPlusEven prefers.
5223 using VU32 = VFromD<decltype(du32)>;
5224 const VU32 odd = Set(du32, 0xFFFF0000u);
5225 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5226 const VU32 ao = And(BitCast(du32, a), odd);
5227 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5228 const VU32 bo = And(BitCast(du32, b), odd);
5229 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5230 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5231}
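// Note on the shift/and trick above: a bfloat16 value is exactly the upper
// 16 bits of the corresponding binary32, so placing a bf16 lane into the
// upper half of a u32 lane and bit-casting to float reconstructs its value
// exactly. E.g. bf16 0x3F80 (1.0) becomes u32 0x3F800000, which is 1.0f.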
5232
5233// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
5234template <size_t N>
5235HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
5236 Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
5237 Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
5238 Vec128<int32_t, N>& /*sum1*/) {
5239 return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
5240}
5241
5242// ------------------------------ RearrangeToOddPlusEven
5243template <size_t N>
5244HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(const Vec128<int32_t, N> sum0,
5245 Vec128<int32_t, N> /*sum1*/) {
5246 return sum0; // invariant already holds
5247}
5248
5249template <class VW>
5250HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
5251 return Add(sum0, sum1);
5252}
5253
5254// ================================================== CONVERT
5255
5256// ------------------------------ Promotions (part w/ narrow lanes -> full)
5257
5258// Unsigned: zero-extend.
5259template <size_t N>
5260HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
5261 const Vec128<uint8_t, N> v) {
5262#if HWY_TARGET == HWY_SSSE3
5263 const __m128i zero = _mm_setzero_si128();
5264 return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
5265#else
5266 return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
5267#endif
5268}
5269template <size_t N>
5270HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
5271 const Vec128<uint16_t, N> v) {
5272#if HWY_TARGET == HWY_SSSE3
5273 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
5274#else
5275 return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
5276#endif
5277}
5278template <size_t N>
5279HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
5280 const Vec128<uint32_t, N> v) {
5281#if HWY_TARGET == HWY_SSSE3
5282 return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
5283#else
5284 return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
5285#endif
5286}
5287template <size_t N>
5288HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
5289 const Vec128<uint8_t, N> v) {
5290#if HWY_TARGET == HWY_SSSE3
5291 const __m128i zero = _mm_setzero_si128();
5292 const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
5293 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
5294#else
5295 return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
5296#endif
5297}
5298
5299// Unsigned to signed: same plus cast.
5300template <size_t N>
5301HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> di,
5302 const Vec128<uint8_t, N> v) {
5303 return BitCast(di, PromoteTo(Simd<uint16_t, N, 0>(), v));
5304}
5305template <size_t N>
5306HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
5307 const Vec128<uint16_t, N> v) {
5308 return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
5309}
5310template <size_t N>
5311HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
5312 const Vec128<uint8_t, N> v) {
5313 return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
5314}
5315
5316// Signed: replicate sign bit.
5317template <size_t N>
5318HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
5319 const Vec128<int8_t, N> v) {
5320#if HWY_TARGET == HWY_SSSE3
5321 return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
5322#else
5323 return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
5324#endif
5325}
5326template <size_t N>
5327HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
5328 const Vec128<int16_t, N> v) {
5329#if HWY_TARGET == HWY_SSSE3
5330 return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
5331#else
5332 return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
5333#endif
5334}
5335template <size_t N>
5336HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
5337 const Vec128<int32_t, N> v) {
5338#if HWY_TARGET == HWY_SSSE3
5339 return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
5340#else
5341 return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
5342#endif
5343}
5344template <size_t N>
5345HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
5346 const Vec128<int8_t, N> v) {
5347#if HWY_TARGET == HWY_SSSE3
5348 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
5349 const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
5350 return ShiftRight<24>(Vec128<int32_t, N>{x4});
5351#else
5352 return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
5353#endif
5354}
5355
5356// Workaround for origin tracking bug in Clang msan prior to 11.0
5357// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
5358#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
5359#define HWY_INLINE_F16 HWY_NOINLINE
5360#else
5361#define HWY_INLINE_F16 HWY_INLINE
5362#endif
5363template <size_t N>
5364HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
5365 const Vec128<float16_t, N> v) {
5366#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5367 const RebindToSigned<decltype(df32)> di32;
5368 const RebindToUnsigned<decltype(df32)> du32;
5369 // Expand to u32 so we can shift.
5370 const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
5371 const auto sign = ShiftRight<15>(bits16);
5372 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
5373 const auto mantissa = bits16 & Set(du32, 0x3FF);
5374 const auto subnormal =
5375 BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
5376 Set(df32, 1.0f / 16384 / 1024));
5377
5378 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
5379 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
5380 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
5381 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
5382 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
5383#else
5384 (void)df32;
5385 return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
5386#endif
5387}
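// Worked example for the manual path: half 0xC400 has sign=1, biased_exp=17
// and mantissa=0. biased_exp32 = 17 + (127 - 15) = 129, so the assembled
// bits are 0xC0800000, i.e. -4.0f, matching _mm_cvtph_ps. A subnormal half
// such as 0x0001 takes the other branch: 1.0f * 2^-14 * 2^-10 = 2^-24, the
// smallest positive half value.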
5388
5389template <size_t N>
5390HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
5391 const Vec128<bfloat16_t, N> v) {
5392 const Rebind<uint16_t, decltype(df32)> du16;
5393 const RebindToSigned<decltype(df32)> di32;
5394 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
5395}
5396
5397template <size_t N>
5398HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
5399 const Vec128<float, N> v) {
5400 return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
5401}
5402
5403template <size_t N>
5404HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
5405 const Vec128<int32_t, N> v) {
5406 return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
5407}
5408
5409// ------------------------------ Demotions (full -> part w/ narrow lanes)
5410
5411template <size_t N>
5412HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
5413 const Vec128<int32_t, N> v) {
5414#if HWY_TARGET == HWY_SSSE3
5415 const Simd<int32_t, N, 0> di32;
5416 const Simd<uint16_t, N * 2, 0> du16;
5417 const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
5418 const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
5419 const auto clamped = Or(zero_if_neg, too_big);
5420 // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
5421 alignas(16) constexpr uint16_t kLower2Bytes[16] = {
5422 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
5423 const auto lo2 = Load(du16, kLower2Bytes);
5424 return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
5425#else
5426 return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
5427#endif
5428}
5429
5430template <size_t N>
5431HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
5432 const Vec128<int32_t, N> v) {
5433 return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
5434}
5435
5436template <size_t N>
5437HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
5438 const Vec128<int32_t, N> v) {
5439 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5440 return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
5441}
5442
5443template <size_t N>
5444HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
5445 const Vec128<int16_t, N> v) {
5446 return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
5447}
5448
5449template <size_t N>
5450HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
5451 const Vec128<int32_t, N> v) {
5452 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
5453 return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
5454}
5455
5456template <size_t N>
5457HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
5458 const Vec128<int16_t, N> v) {
5459 return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
5460}
5461
5462// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
5463// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
5464HWY_DIAGNOSTICS(push)
5465HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
5466
5467template <size_t N>
5468HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
5469 const Vec128<float, N> v) {
5470#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
5471 const RebindToUnsigned<decltype(df16)> du16;
5472 const Rebind<uint32_t, decltype(df16)> du;
5473 const RebindToSigned<decltype(du)> di;
5474 const auto bits32 = BitCast(du, v);
5475 const auto sign = ShiftRight<31>(bits32);
5476 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
5477 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
5478
5479 const auto k15 = Set(di, 15);
5480 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
5481 const auto is_tiny = exp < Set(di, -24);
5482
5483 const auto is_subnormal = exp < Set(di, -14);
5484 const auto biased_exp16 =
5485 BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
5486 const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
5487 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
5488 (mantissa32 >> (Set(du, 13) + sub_exp));
5489 const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
5490 ShiftRight<13>(mantissa32)); // <1024
5491
5492 const auto sign16 = ShiftLeft<15>(sign);
5493 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
5494 const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
5495 return BitCast(df16, DemoteTo(du16, bits16));
5496#else
5497 (void)df16;
5498 return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
5499#endif
5500}
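// Worked example for the manual path: 1.5f has bits 0x3FC00000, so
// biased_exp32=127, mantissa32=0x400000 and exp=0 (neither tiny nor
// subnormal). biased_exp16 = 0 + 15 = 15 and mantissa16 = 0x400000 >> 13 =
// 0x200, giving 0x3E00, which is 1.5 in binary16.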
5501
5502HWY_DIAGNOSTICS(pop)
5503
5504template <size_t N>
5505HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
5506 const Vec128<float, N> v) {
5507 // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
5508 const Rebind<int32_t, decltype(dbf16)> di32;
5509 const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
5510 const Rebind<uint16_t, decltype(dbf16)> du16;
5511 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
5512 return BitCast(dbf16, DemoteTo(du16, bits_in_32));
5513}
5514
5515template <size_t N>
5516HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
5517 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
5518 // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
5519 const RebindToUnsigned<decltype(dbf16)> du16;
5520 const Repartition<uint32_t, decltype(dbf16)> du32;
5521 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
5522 return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
5523}
5524
5525// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
5526HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
5527 Vec128<int32_t, 1> a,
5528 Vec128<int32_t, 1> b) {
5529 const Half<decltype(dn)> dnh;
5530 // Pretend the result has twice as many lanes so we can InterleaveLower.
5531 const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
5532 const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
5533 return InterleaveLower(an, bn);
5534}
5535HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
5536 Vec128<int32_t, 2> a,
5537 Vec128<int32_t, 2> b) {
5538 const Half<decltype(dn)> dnh;
5539 // Pretend the result has twice as many lanes so we can InterleaveLower.
5540 const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
5541 const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
5542 return InterleaveLower(an, bn);
5543}
5544HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
5545 Vec128<int32_t> a, Vec128<int32_t> b) {
5546 return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
5547}
5548
5549template <size_t N>
5550HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
5551 const Vec128<double, N> v) {
5552 return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
5553}
5554
5555namespace detail {
5556
5557// For well-defined float->int demotion in all x86_*-inl.h.
5558
5559template <size_t N>
5560HWY_INLINE auto ClampF64ToI32Max(Simd<double, N, 0> d, decltype(Zero(d)) v)
5561 -> decltype(Zero(d)) {
5562 // The max can be exactly represented in binary64, so clamping beforehand
5563 // prevents x86 conversion from raising an exception and returning 80..00.
5564 return Min(v, Set(d, 2147483647.0));
5565}
5566
5567// For ConvertTo float->int of same size, clamping before conversion would
5568// change the result because the max integer value is not exactly representable.
5569// Instead detect the overflow result after conversion and fix it.
5570template <class DI, class DF = RebindToFloat<DI>>
5571HWY_INLINE auto FixConversionOverflow(DI di, VFromD<DF> original,
5572 decltype(Zero(di).raw) converted_raw)
5573 -> VFromD<DI> {
5574 // Combinations of original and output sign:
5575 // --: normal <0 or -huge_val to 80..00: OK
5576 // -+: -0 to 0 : OK
5577 // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
5578 // ++: normal >0 : OK
5579 const auto converted = VFromD<DI>{converted_raw};
5580 const auto sign_wrong = AndNot(BitCast(di, original), converted);
5581#if HWY_COMPILER_GCC_ACTUAL
5582 // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
5583 // Add() if using that instead. Work around with one more instruction.
5584 const RebindToUnsigned<DI> du;
5585 const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
5586 const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
5587 return IfVecThenElse(mask, max, converted);
5588#else
5589 return Xor(converted, BroadcastSignBit(sign_wrong));
5590#endif
5591}
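// Worked example: for float input 3e9f (> INT32_MAX), cvttps returns
// 0x80000000. original has MSB 0 and converted has MSB 1, so sign_wrong is
// negative; xoring converted with the broadcast sign bit (FF..FF) yields
// 0x7FFFFFFF, the saturated result.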
5592
5593} // namespace detail
5594
5595template <size_t N>
5596HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
5597 const Vec128<double, N> v) {
5598 const auto clamped = detail::ClampF64ToI32Max(Simd<double, N, 0>(), v);
5599 return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
5600}
5601
5602// For already range-limited input [0, 255].
5603template <size_t N>
5604HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
5605 const Simd<uint32_t, N, 0> d32;
5606 const Simd<uint8_t, N * 4, 0> d8;
5607 alignas(16) static constexpr uint32_t k8From32[4] = {
5608 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
5609 // Also replicate bytes into all 32-bit lanes for safety.
5610 const auto quad = TableLookupBytes(v, Load(d32, k8From32));
5611 return LowerHalf(LowerHalf(BitCast(d8, quad)));
5612}
5613
5614// ------------------------------ Truncations
5615
5616template <typename From, typename To,
5617 hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
5618HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
5619 const Vec128<From, 1> v) {
5620 static_assert(!IsSigned<To>() && !IsSigned<From>(), "Unsigned only");
5621 const Repartition<To, DFromV<decltype(v)>> d;
5622 const auto v1 = BitCast(d, v);
5623 return Vec128<To, 1>{v1.raw};
5624}
5625
5626HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
5627 const Vec128<uint64_t, 2> v) {
5628 const Full128<uint8_t> d8;
5629 alignas(16) static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8,
5630 0, 8, 0, 8, 0, 8, 0, 8};
5631 return LowerHalf(LowerHalf(LowerHalf(TableLookupBytes(v, Load(d8, kMap)))));
5632}
5633
5634HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
5635 const Vec128<uint64_t, 2> v) {
5636 const Full128<uint16_t> d16;
5637 alignas(16) static constexpr uint16_t kMap[8] = {
5638 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
5639 return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d16, kMap))));
5640}
5641
5642HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
5643 const Vec128<uint64_t, 2> v) {
5644 return Vec128<uint32_t, 2>{_mm_shuffle_epi32(v.raw, 0x88)};
5645}
5646
5647template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
5648HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
5649 const Vec128<uint32_t, N> v) {
5650 const Repartition<uint8_t, DFromV<decltype(v)>> d;
5651 alignas(16) static constexpr uint8_t kMap[16] = {
5652 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
5653 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
5654 return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kMap))));
5655}
5656
5657template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
5658HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
5659 const Vec128<uint32_t, N> v) {
5660 const Repartition<uint16_t, DFromV<decltype(v)>> d;
5661 const auto v1 = BitCast(d, v);
5662 return LowerHalf(ConcatEven(d, v1, v1));
5663}
5664
5665template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
5666HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
5667 const Vec128<uint16_t, N> v) {
5668 const Repartition<uint8_t, DFromV<decltype(v)>> d;
5669 const auto v1 = BitCast(d, v);
5670 return LowerHalf(ConcatEven(d, v1, v1));
5671}
5672
5673// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
5674
5675template <size_t N>
5676HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
5677 const Vec128<int32_t, N> v) {
5678 return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
5679}
5680
5681template <size_t N>
5682HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> df,
5683 const Vec128<uint32_t, N> v) {
5684#if HWY_TARGET <= HWY_AVX3
5685 return Vec128<float, N>{_mm_cvtepu32_ps(v.raw)};
5686#else
5687 // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
5688 const RebindToUnsigned<decltype(df)> du32;
5689 const RebindToSigned<decltype(df)> d32;
5690
5691 const auto msk_lo = Set(du32, 0xFFFF);
5692 const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
5693
5694 // Extract the 16 lowest/highest significant bits of v and cast to signed int
5695 const auto v_lo = BitCast(d32, And(v, msk_lo));
5696 const auto v_hi = BitCast(d32, ShiftRight<16>(v));
5697 return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
5698#endif
5699}
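// Worked example for the fallback: v = 0x80000000 splits into v_lo = 0 and
// v_hi = 0x8000 = 32768, so the result is 32768.0f * 65536.0f + 0.0f =
// 2147483648.0f, whereas converting the same bits as int32 would give
// -2147483648.0f.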
5700
5701template <size_t N>
5702HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
5703 const Vec128<int64_t, N> v) {
5704#if HWY_TARGET <= HWY_AVX3
5705 (void)dd;
5706 return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
5707#else
5708 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
5709 const Repartition<uint32_t, decltype(dd)> d32;
5710 const Repartition<uint64_t, decltype(dd)> d64;
5711
5712 // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
5713 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
5714 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
5715
5716 // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
5717 const auto k52 = Set(d32, 0x43300000);
5718 const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
5719
5720 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
5721 return (v_upper - k84_63_52) + v_lower; // order matters!
5722#endif
5723}
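// Why the magic constants work: after the shift and xor, v_upper is the
// double 2^84 + (hi ^ 0x80000000) * 2^32 = 2^84 + 2^63 + int32(hi) * 2^32,
// where hi is the upper 32 bits of v, and v_lower is 2^52 + lo. Subtracting
// k84_63_52 = 2^84 + 2^63 + 2^52 cancels the constants exactly before the
// low half is added, leaving int32(hi) * 2^32 + lo = v. E.g. v = -1 gives
// (2^84 + 2^63 - 2^32) - (2^84 + 2^63 + 2^52) + (2^52 + 0xFFFFFFFF) = -1.0.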
5724
5725template <size_t N>
5726HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
5727 const Vec128<uint64_t, N> v) {
5728#if HWY_TARGET <= HWY_AVX3
5729 return Vec128<double, N>{_mm_cvtepu64_pd(v.raw)};
5730#else
5731 // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
5732 const RebindToUnsigned<decltype(dd)> d64;
5733 using VU = VFromD<decltype(d64)>;
5734
5735 const VU msk_lo = Set(d64, 0xFFFFFFFF);
5736 const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
5737
5738 // Extract the 32 lowest/highest significant bits of v
5739 const VU v_lo = And(v, msk_lo);
5740 const VU v_hi = ShiftRight<32>(v);
5741
5742 auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR {
5743 w = Or(w, VU{detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
5744 return BitCast(dd, w) - Set(dd, 0x0010000000000000);
5745 };
5746
5747 const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
5748 return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
5749#endif
5750}
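// The lambda exploits that Set(dd, 0x0010000000000000) is the double 2^52,
// whose bit pattern is 0x4330000000000000. Or-ing that pattern into a value
// w < 2^52 yields the double 2^52 + w, so subtracting 2^52 recovers w
// exactly. v_lo and v_hi are both below 2^32, and the final MulAdd combines
// them as v_hi * 2^32 + v_lo.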
5751
5752// Truncates (rounds toward zero).
5753template <size_t N>
5754HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
5755 const Vec128<float, N> v) {
5756 return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
5757}
5758
5759// Full (partial handled below)
5760HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
5761#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
5762 return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
5763#elif HWY_ARCH_X86_64
5764 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
5765 const Half<Full128<double>> dd2;
5766 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
5767 return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
5768#else
5769 using VI = VFromD<decltype(di)>;
5770 const VI k0 = Zero(di);
5771 const VI k1 = Set(di, 1);
5772 const VI k51 = Set(di, 51);
5773
5774 // Exponent indicates whether the number can be represented as int64_t.
5775 const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
5776 const VI exp = biased_exp - Set(di, 0x3FF);
5777 const auto in_range = exp < Set(di, 63);
5778
5779 // If we were to cap the exponent at 51 and add 2^52, the number would be in
5780 // [2^52, 2^53) and mantissa bits could be read out directly. We need to
5781 // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
5782 // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
5783 // manually shift the mantissa into place (we already have many of the
5784 // inputs anyway).
5785 const VI shift_mnt = Max(k51 - exp, k0);
5786 const VI shift_int = Max(exp - k51, k0);
5787 const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
5788 // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
5789 const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
5790 // For inputs larger than 2^52, insert zeros at the bottom.
5791 const VI shifted = int52 << shift_int;
5792 // Restore the one bit lost when shifting in the implicit 1-bit.
5793 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
5794
5795 // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
5796 const VI sign_mask = BroadcastSignBit(BitCast(di, v));
5797 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
5798 const VI magnitude = IfThenElse(in_range, restored, limit);
5799
5800 // If the input was negative, negate the integer (two's complement).
5801 return (magnitude ^ sign_mask) - sign_mask;
5802#endif
5803}
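// Worked example for the manual path: v = 3.0 has biased_exp = 1024, so
// exp = 1 and mantissa = 1 << 51. Then shift_mnt = 50, shift_int = 0 and
// int52 = ((1 << 51) | (1 << 52)) >> 51 = 3; no further shifting or
// saturation applies, so the result is 3.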
5804HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> di, const Vec64<double> v) {
5805 // Only need to specialize for non-AVX3, 64-bit (single scalar op)
5806#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
5807 const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
5808 return detail::FixConversionOverflow(di, v, i0.raw);
5809#else
5810 (void)di;
5811 const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
5812 return Vec64<int64_t>{full.raw};
5813#endif
5814}
5815
5816template <size_t N>
5817HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
5818 const Simd<int32_t, N, 0> di;
5819 return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
5820}
5821
5822// ------------------------------ Floating-point rounding (ConvertTo)
5823
5824#if HWY_TARGET == HWY_SSSE3
5825
5826// Toward nearest integer, ties to even
5827template <typename T, size_t N>
5828HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
5829 static_assert(IsFloat<T>(), "Only for float");
5830 // Rely on rounding after addition with a large value such that no mantissa
5831 // bits remain (assuming the current mode is nearest-even). We may need a
5832 // compiler flag for precise floating-point to prevent "optimizing" this out.
5833 const Simd<T, N, 0> df;
5834 const auto max = Set(df, MantissaEnd<T>());
5835 const auto large = CopySignToAbs(max, v);
5836 const auto added = large + v;
5837 const auto rounded = added - large;
5838 // Keep original if NaN or the magnitude is large (already an int).
5839 return IfThenElse(Abs(v) < max, rounded, v);
5840}
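// Worked example: for float, MantissaEnd<float>() is 2^23 = 8388608.
// Round(2.5f) computes 8388608 + 2.5 = 8388610.5, which rounds to the even
// neighbor 8388610 because floats >= 2^23 are spaced 1 apart; subtracting
// 8388608 gives 2.0, the ties-to-even result.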
5841
5842namespace detail {
5843
5844// Truncating to integer and converting back to float is correct except when the
5845// input magnitude is large, in which case the input was already an integer
5846// (because mantissa >> exponent is zero).
5847template <typename T, size_t N>
5848HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
5849 static_assert(IsFloat<T>(), "Only for float");
5850 return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
5851}
5852
5853} // namespace detail
5854
5855// Toward zero, aka truncate
5856template <typename T, size_t N>
5857HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
5858 static_assert(IsFloat<T>(), "Only for float");
5859 const Simd<T, N, 0> df;
5860 const RebindToSigned<decltype(df)> di;
5861
5862 const auto integer = ConvertTo(di, v); // round toward 0
5863 const auto int_f = ConvertTo(df, integer);
5864
5865 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
5866}
5867
5868// Toward +infinity, aka ceiling
5869template <typename T, size_t N>
5870HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
5871 static_assert(IsFloat<T>(), "Only for float");
5872 const Simd<T, N, 0> df;
5873 const RebindToSigned<decltype(df)> di;
5874
5875 const auto integer = ConvertTo(di, v); // round toward 0
5876 const auto int_f = ConvertTo(df, integer);
5877
5878 // Truncating a positive non-integer ends up smaller; if so, add 1.
5879 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
5880
5881 return IfThenElse(detail::UseInt(v), int_f - neg1, v);
5882}
5883
5884// Toward -infinity, aka floor
5885template <typename T, size_t N>
5886HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
5887 static_assert(IsFloat<T>(), "Only for float");
5888 const Simd<T, N, 0> df;
5889 const RebindToSigned<decltype(df)> di;
5890
5891 const auto integer = ConvertTo(di, v); // round toward 0
5892 const auto int_f = ConvertTo(df, integer);
5893
5894 // Truncating a negative non-integer ends up larger; if so, subtract 1.
5895 const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
5896
5897 return IfThenElse(detail::UseInt(v), int_f + neg1, v);
5898}
5899
5900#else
5901
5902// Toward nearest integer, ties to even
5903template <size_t N>
5904HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
5905 return Vec128<float, N>{
5906 _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5907}
5908template <size_t N>
5909HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
5910 return Vec128<double, N>{
5911 _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5912}
5913
5914// Toward zero, aka truncate
5915template <size_t N>
5916HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
5917 return Vec128<float, N>{
5918 _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5919}
5920template <size_t N>
5921HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
5922 return Vec128<double, N>{
5923 _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5924}
5925
5926// Toward +infinity, aka ceiling
5927template <size_t N>
5928HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5929 return Vec128<float, N>{
5930 _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5931}
5932template <size_t N>
5933HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
5934 return Vec128<double, N>{
5935 _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5936}
5937
5938// Toward -infinity, aka floor
5939template <size_t N>
5940HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
5941 return Vec128<float, N>{
5942 _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5943}
5944template <size_t N>
5945HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
5946 return Vec128<double, N>{
5947 _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5948}
5949
5950#endif // !HWY_SSSE3
5951
5952// ------------------------------ Floating-point classification
5953
5954template <size_t N>
5955HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
5956#if HWY_TARGET <= HWY_AVX3
5957 return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x81)};
5958#else
5959 return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
5960#endif
5961}
5962template <size_t N>
5963HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
5964#if HWY_TARGET <= HWY_AVX3
5965 return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x81)};
5966#else
5967 return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
5968#endif
5969}
5970
5971#if HWY_TARGET <= HWY_AVX3
5972
5973template <size_t N>
5974HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
5975 return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x18)};
5976}
5977template <size_t N>
5978HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
5979 return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x18)};
5980}
5981
5982// Returns whether normal/subnormal/zero.
5983template <size_t N>
5984HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
5985 // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
5986 // and negate the mask.
5987 return Not(Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x99)});
5988}
5989template <size_t N>
5990HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
5991 return Not(Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x99)});
5992}
5993
5994#else
5995
5996template <typename T, size_t N>
5997HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
5998 static_assert(IsFloat<T>(), "Only for float");
5999 const Simd<T, N, 0> d;
6000 const RebindToSigned<decltype(d)> di;
6001 const VFromD<decltype(di)> vi = BitCast(di, v);
6002 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
6003 return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
6004}
6005
6006// Returns whether normal/subnormal/zero.
6007template <typename T, size_t N>
6008HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
6009 static_assert(IsFloat<T>(), "Only for float");
6010 const Simd<T, N, 0> d;
6011 const RebindToUnsigned<decltype(d)> du;
6012 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
6013 const VFromD<decltype(du)> vu = BitCast(du, v);
6014 // Shift left to clear the sign bit, then right so we can compare with the
6015 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
6016 // negative and non-negative floats would be greater). MSVC seems to generate
6017 // incorrect code if we instead add vu + vu.
6018 const VFromD<decltype(di)> exp =
6019 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
6020 return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
6021}
6022
6023#endif // HWY_TARGET <= HWY_AVX3
6024
6025// ================================================== CRYPTO
6026
6027#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
6028
6029// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
6030#ifdef HWY_NATIVE_AES
6031#undef HWY_NATIVE_AES
6032#else
6033#define HWY_NATIVE_AES
6034#endif
6035
6036HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
6037 Vec128<uint8_t> round_key) {
6038 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
6039}
6040
6041HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
6042 Vec128<uint8_t> round_key) {
6043 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
6044}
6045
6046template <size_t N, HWY_IF_LE128(uint64_t, N)>
6047HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
6048 Vec128<uint64_t, N> b) {
6049 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
6050}
6051
6052template <size_t N, HWY_IF_LE128(uint64_t, N)>
6053HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
6054 Vec128<uint64_t, N> b) {
6055 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
6056}
6057
6058#endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
6059
6060// ================================================== MISC
6061
6062// ------------------------------ LoadMaskBits (TestBit)
6063
6064#if HWY_TARGET > HWY_AVX3
6065namespace detail {
6066
6067template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
6068HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6069 const RebindToUnsigned<decltype(d)> du;
6070 // Easier than Set(), which would require an >8-bit type, which would not
6071 // compile for T=uint8_t, N=1.
6072 const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
6073
6074 // Replicate bytes 8x such that each byte contains the bit that governs it.
6075 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
6076 1, 1, 1, 1, 1, 1, 1, 1};
6077 const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
6078
6079 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
6080 1, 2, 4, 8, 16, 32, 64, 128};
6081 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
6082}
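// Worked example: mask_bits = 0x5 with N = 4 byte lanes. rep8 makes each of
// the first lanes hold 0x05, and TestBit against {1, 2, 4, 8} yields
// {true, false, true, false}, i.e. lanes 0 and 2 are set.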
6083
6084template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6085HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6086 const RebindToUnsigned<decltype(d)> du;
6087 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
6088 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
6089 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
6090}
6091
6092template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6093HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6094 const RebindToUnsigned<decltype(d)> du;
6095 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
6096 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
6097 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
6098}
6099
6100template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6101HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6102 const RebindToUnsigned<decltype(d)> du;
6103 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
6104 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
6105}
6106
6107} // namespace detail
6108#endif // HWY_TARGET > HWY_AVX3
6109
6110// `p` points to at least 8 readable bytes, not all of which need be valid.
6111template <typename T, size_t N, HWY_IF_LE128(T, N)>
6112HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
6113 const uint8_t* HWY_RESTRICT bits) {
6114#if HWY_TARGET <= HWY_AVX3
6115 (void)d;
6116 uint64_t mask_bits = 0;
6117 constexpr size_t kNumBytes = (N + 7) / 8;
6118 CopyBytes<kNumBytes>(bits, &mask_bits);
6119 if (N < 8) {
6120 mask_bits &= (1ull << N) - 1;
6121 }
6122
6123 return Mask128<T, N>::FromBits(mask_bits);
6124#else
6125 uint64_t mask_bits = 0;
6126 constexpr size_t kNumBytes = (N + 7) / 8;
6127 CopyBytes<kNumBytes>(bits, &mask_bits);
6128 if (N < 8) {
6129 mask_bits &= (1ull << N) - 1;
6130 }
6131
6132 return detail::LoadMaskBits(d, mask_bits);
6133#endif
6134}
6135
6136template <typename T>
6137struct CompressIsPartition {
6138#if HWY_TARGET <= HWY_AVX3
6139 // AVX3 supports native compress, but a table-based approach allows
6140 // 'partitioning' (also moving mask=false lanes to the top), which helps
6141 // vqsort. This is only feasible for eight or fewer lanes, i.e. sizeof(T) == 8
6142 // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
6143 // u32x8 etc.).
6144 enum { value = (sizeof(T) == 8) };
6145#else
6146 // generic_ops-inl does not guarantee IsPartition for 8-bit.
6147 enum { value = (sizeof(T) != 1) };
6148#endif
6149};
6150
6151#if HWY_TARGET <= HWY_AVX3
6152
6153// ------------------------------ StoreMaskBits
6154
6155// `p` points to at least 8 writable bytes.
6156template <typename T, size_t N>
6157HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
6158 const Mask128<T, N> mask, uint8_t* bits) {
6159 constexpr size_t kNumBytes = (N + 7) / 8;
6160 CopyBytes<kNumBytes>(&mask.raw, bits);
6161
6162 // Non-full byte, need to clear the undefined upper bits.
6163 if (N < 8) {
6164 const int mask_bits = (1 << N) - 1;
6165 bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
6166 }
6167
6168 return kNumBytes;
6169}
6170
6171// ------------------------------ Mask testing
6172
6173// Beware: the suffix indicates the number of mask bits, not lane size!
6174
6175template <typename T, size_t N>
6176HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
6177 const Mask128<T, N> mask) {
6178 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
6179 return PopCount(mask_bits);
6180}
6181
6182template <typename T, size_t N>
6183HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
6184 const Mask128<T, N> mask) {
6185 const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
6186 return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
6187}
6188
6189template <typename T, size_t N>
6190HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
6191 const Mask128<T, N> mask) {
6192 const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
6193 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
6194}
6195
6196template <typename T, size_t N>
6197HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6198 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
6199 return mask_bits == 0;
6200}
6201
6202template <typename T, size_t N>
6203HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6204 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
6205 // Cannot use _kortestc because we may have less than 8 mask bits.
6206 return mask_bits == (1u << N) - 1;
6207}
6208
6209// ------------------------------ Compress
6210
6211// 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512.
6212
6213// Single lane: no-op
6214template <typename T>
6215HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6216 return v;
6217}
6218
6219template <size_t N, HWY_IF_GE64(float, N)>
6220HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
6221 return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
6222}
6223
6224template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6225HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6226 HWY_DASSERT(mask.raw < 4);
6227
6228 // There are only 2 lanes, so we can afford to load the index vector directly.
6229 alignas(16) constexpr uint8_t u8_indices[64] = {
6230 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6231 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6232 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6233 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6234
6235 const Full128<T> d;
6236 const Repartition<uint8_t, decltype(d)> d8;
6237 const auto index = Load(d8, u8_indices + 16 * mask.raw);
6238 return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
6239}
6240
6241// ------------------------------ CompressNot (Compress)
6242
6243// Single lane: no-op
6244template <typename T>
6245HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6246 return v;
6247}
6248
6249template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6250HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
6251 // See CompressIsPartition, PrintCompressNot64x2NibbleTables
6252 alignas(16) constexpr uint64_t packed_array[16] = {0x00000010, 0x00000001,
6253 0x00000010, 0x00000010};
6254
6255 // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
6256 // _mm_permutexvar_epi64 will ignore the upper bits.
6257 const Full128<T> d;
6258 const RebindToUnsigned<decltype(d)> du64;
6259 const auto packed = Set(du64, packed_array[mask.raw]);
6260 alignas(16) constexpr uint64_t shifts[2] = {0, 4};
6261 const auto indices = Indices128<T>{(packed >> Load(du64, shifts)).raw};
6262 return TableLookupLanes(v, indices);
6263}
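// Worked example: mask.raw = 1 (only lane 0 true) selects packed_array[1] =
// 0x00000001, whose nibbles give indices {1, 0}: the false lane 1 moves to
// the front and the true lane 0 to the back. mask.raw = 0, 2 or 3 all select
// 0x00000010, the identity order {0, 1}.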
6264
6265// ------------------------------ CompressBlocksNot
6266HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6267 Mask128<uint64_t> /* m */) {
6268 return v;
6269}
6270
6271// ------------------------------ CompressStore
6272
6273template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
6274HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6275 Simd<T, N, 0> /* tag */,
6276 T* HWY_RESTRICT unaligned) {
6277 _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
6278 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6279 detail::MaybeUnpoison(unaligned, count);
6280 return count;
6281}
6282
6283template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
6284HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
6285 Simd<T, N, 0> /* tag */,
6286 T* HWY_RESTRICT unaligned) {
6287 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
6288 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6289 detail::MaybeUnpoison(unaligned, count);
6290 return count;
6291}
6292
6293template <size_t N, HWY_IF_LE128(float, N)>
6294HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
6295 Simd<float, N, 0> /* tag */,
6296 float* HWY_RESTRICT unaligned) {
6297 _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
6298 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6299 detail::MaybeUnpoison(unaligned, count);
6300 return count;
6301}
6302
6303template <size_t N, HWY_IF_LE128(double, N)>
6304HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
6305 Simd<double, N, 0> /* tag */,
6306 double* HWY_RESTRICT unaligned) {
6307 _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
6308 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
6309 detail::MaybeUnpoison(unaligned, count);
6310 return count;
6311}
6312
6313// ------------------------------ CompressBlendedStore (CompressStore)
6314template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6315HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6316 Simd<T, N, 0> d,
6317 T* HWY_RESTRICT unaligned) {
6318 // AVX-512 already does the blending at no extra cost (latency 11,
6319 // rthroughput 2 - same as compress plus store).
6320 if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
6321 // We're relying on the mask to blend. Clear the undefined upper bits.
6322 if (N != 16 / sizeof(T)) {
6323 m = And(m, FirstN(d, N));
6324 }
6325 return CompressStore(v, m, d, unaligned);
6326 } else {
6327 const size_t count = CountTrue(d, m);
6328 const Vec128<T, N> compressed = Compress(v, m);
6329#if HWY_MEM_OPS_MIGHT_FAULT
6330 // BlendedStore tests mask for each lane, but we know that the mask is
6331 // FirstN, so we can just copy.
6332 alignas(16) T buf[N];
6333 Store(compressed, d, buf);
6334 memcpy(unaligned, buf, count * sizeof(T));
6335#else
6336 BlendedStore(compressed, FirstN(d, count), d, unaligned);
6337#endif
6338 detail::MaybeUnpoison(unaligned, count);
6339 return count;
6340 }
6341}
6342
6343// ------------------------------ CompressBitsStore (LoadMaskBits)
6344
6345template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6346HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6347 const uint8_t* HWY_RESTRICT bits,
6348 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6349 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
6350}
6351
6352#else // AVX2 or below
6353
6354// ------------------------------ StoreMaskBits
6355
6356namespace detail {
6357
6358constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
6359 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
6360}
6361
6362template <typename T, size_t N>
6363HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
6364 const Mask128<T, N> mask) {
6365 const Simd<T, N, 0> d;
6366 const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
6367 return U64FromInt(_mm_movemask_epi8(sign_bits));
6368}
6369
6370template <typename T, size_t N>
6371HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
6372 const Mask128<T, N> mask) {
6373 // Remove useless lower half of each u16 while preserving the sign bit.
6374 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
6375 return U64FromInt(_mm_movemask_epi8(sign_bits));
6376}
6377
6378template <typename T, size_t N>
6379HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
6380 const Mask128<T, N> mask) {
6381 const Simd<T, N, 0> d;
6382 const Simd<float, N, 0> df;
6383 const auto sign_bits = BitCast(df, VecFromMask(d, mask));
6384 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
6385}
6386
6387template <typename T, size_t N>
6388HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
6389 const Mask128<T, N> mask) {
6390 const Simd<T, N, 0> d;
6391 const Simd<double, N, 0> df;
6392 const auto sign_bits = BitCast(df, VecFromMask(d, mask));
6393 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
6394}
6395
6396// Returns the lowest N of the _mm_movemask* bits.
6397template <typename T, size_t N>
6398constexpr uint64_t OnlyActive(uint64_t mask_bits) {
6399 return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
6400}
6401
6402template <typename T, size_t N>
6403HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
6404 return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
6405}
6406
6407} // namespace detail
6408
6409// `p` points to at least 8 writable bytes.
6410template <typename T, size_t N>
6411HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
6412 const Mask128<T, N> mask, uint8_t* bits) {
6413 constexpr size_t kNumBytes = (N + 7) / 8;
6414 const uint64_t mask_bits = detail::BitsFromMask(mask);
6415 CopyBytes<kNumBytes>(&mask_bits, bits);
6416 return kNumBytes;
6417}
6418
6419// ------------------------------ Mask testing
6420
6421template <typename T, size_t N>
6422HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6423 // Cheaper than PTEST, which is 2 uop / 3L.
6424 return detail::BitsFromMask(mask) == 0;
6425}
6426
6427template <typename T, size_t N>
6428HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
6429 constexpr uint64_t kAllBits =
6430 detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
6431 return detail::BitsFromMask(mask) == kAllBits;
6432}
6433
6434template <typename T, size_t N>
6435HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
6436 const Mask128<T, N> mask) {
6437 return PopCount(detail::BitsFromMask(mask));
6438}
6439
6440template <typename T, size_t N>
6441HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
6442 const Mask128<T, N> mask) {
6443 const uint64_t mask_bits = detail::BitsFromMask(mask);
6444 return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
6445}
6446
6447template <typename T, size_t N>
6448HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
6449 const Mask128<T, N> mask) {
6450 const uint64_t mask_bits = detail::BitsFromMask(mask);
6451 return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
6452}
6453
6454// ------------------------------ Compress, CompressBits
6455
6456namespace detail {
6457
6458// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
6459template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6460HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6461 HWY_DASSERT(mask_bits < 256);
6462 const Rebind<uint8_t, decltype(d)> d8;
6463 const Simd<uint16_t, N, 0> du;
6464
6465 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
6466 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
6467 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
6468 // store lane indices and convert to byte indices (2*lane + 0..1), with the
6469 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
6470 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
6471 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
6472 // is likely more costly than the higher cache footprint from storing bytes.
6473 alignas(16) constexpr uint8_t table[2048] = {
6474 // PrintCompress16x8Tables
6475 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6476 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6477 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
6478 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6479 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
6480 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
6481 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
6482 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6483 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
6484 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
6485 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
6486 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
6487 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
6488 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
6489 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
6490 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6491 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
6492 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
6493 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
6494 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
6495 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
6496 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
6497 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
6498 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
6499 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
6500 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
6501 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
6502 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
6503 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
6504 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
6505 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
6506 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6507 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
6508 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
6509 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
6510 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
6511 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
6512 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
6513 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
6514 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
6515 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
6516 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
6517 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
6518 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
6519 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
6520 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
6521 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
6522 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
6523 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
6524 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
6525 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
6526 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
6527 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
6528 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
6529 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
6530 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
6531 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
6532 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
6533 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
6534 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
6535 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
6536 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
6537 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
6538 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
6539 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
6540 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
6541 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
6542 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
6543 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
6544 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
6545 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
6546 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
6547 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
6548 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
6549 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
6550 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
6551 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
6552 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
6553 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
6554 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
6555 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
6556 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
6557 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
6558 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
6559 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
6560 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
6561 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
6562 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
6563 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
6564 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
6565 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
6566 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
6567 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
6568 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
6569 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
6570 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
6571 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
6572 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
6573 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
6574 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
6575 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
6576 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
6577 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
6578 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
6579 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
6580 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
6581 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
6582 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
6583 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
6584 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
6585 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
6586 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
6587 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
6588 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
6589 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
6590 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
6591 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
6592 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
6593 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
6594 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
6595 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
6596 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
6597 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
6598 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
6599 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
6600 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
6601 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
6602 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
6603
6604 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6605 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6606 return BitCast(d, pairs + Set(du, 0x0100));
6607}
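// How the byte pairs are formed: the table stores the byte offset of the low
// byte of each 16-bit lane (lane i -> 2*i). ZipLower repeats each offset in
// both bytes of a u16, and adding 0x0100 turns {b, b} into {b, b+1}, the two
// consecutive byte indices PSHUFB needs. E.g. table entry 6 (lane 3) becomes
// the pair 0x0706, selecting bytes 6 and 7.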
6608
6609template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
6610HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6611 uint64_t mask_bits) {
6612 HWY_DASSERT(mask_bits < 256);
6613 const Rebind<uint8_t, decltype(d)> d8;
6614 const Simd<uint16_t, N, 0> du;
6615
6616 // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
6617 // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
6618 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
6619 // store lane indices and convert to byte indices (2*lane + 0..1), with the
6620 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
6621 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
6622 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
6623 // is likely more costly than the higher cache footprint from storing bytes.
6624 alignas(16) constexpr uint8_t table[2048] = {
6625 // PrintCompressNot16x8Tables
6626 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, //
6627 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, //
6628 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, //
6629 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, //
6630 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, //
6631 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, //
6632 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, //
6633 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, //
6634 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, //
6635 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, //
6636 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, //
6637 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, //
6638 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, //
6639 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, //
6640 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, //
6641 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, //
6642 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, //
6643 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, //
6644 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, //
6645 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, //
6646 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, //
6647 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, //
6648 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, //
6649 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, //
6650 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, //
6651 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, //
6652 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, //
6653 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, //
6654 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, //
6655 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, //
6656 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, //
6657 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, //
6658 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, //
6659 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, //
6660 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, //
6661 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, //
6662 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, //
6663 0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, //
6664 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, //
6665 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, //
6666 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, //
6667 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, //
6668 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, //
6669 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, //
6670 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, //
6671 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, //
6672 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, //
6673 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, //
6674 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, //
6675 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, //
6676 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, //
6677 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, //
6678 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, //
6679 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, //
6680 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, //
6681 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, //
6682 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, //
6683 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, //
6684 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, //
6685 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, //
6686 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, //
6687 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, //
6688 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, //
6689 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, //
6690 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, //
6691 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, //
6692 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, //
6693 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, //
6694 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, //
6695 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, //
6696 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, //
6697 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, //
6698 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, //
6699 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, //
6700 0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, //
6701 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, //
6702 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, //
6703 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, //
6704 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, //
6705 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, //
6706 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, //
6707 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, //
6708 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, //
6709 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, //
6710 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, //
6711 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, //
6712 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, //
6713 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, //
6714 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, //
6715 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, //
6716 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, //
6717 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, //
6718 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, //
6719 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, //
6720 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, //
6721 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, //
6722 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, //
6723 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, //
6724 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, //
6725 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, //
6726 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, //
6727 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, //
6728 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, //
6729 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, //
6730 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, //
6731 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, //
6732 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, //
6733 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, //
6734 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, //
6735 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, //
6736 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, //
6737 0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, //
6738 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, //
6739 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, //
6740 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, //
6741 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, //
6742 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, //
6743 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, //
6744 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, //
6745 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, //
6746 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, //
6747 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, //
6748 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, //
6749 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, //
6750 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, //
6751 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, //
6752 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, //
6753 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
6754
6755 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
6756 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
6757 return BitCast(d, pairs + Set(du, 0x0100));
6758}
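// Illustrative sketch (annotation, not part of this file): how an 8-byte table
// row of pre-doubled lane indices becomes the 16 byte indices fed to PSHUFB.
// ZipLower(byte_idx, byte_idx) duplicates each byte into a 16-bit pair and
// adding 0x0100 bumps the upper copy by one, so a stored index t expands to
// the byte pair {t, t + 1}. `row` is a hypothetical table row, not data copied
// from this header; the types come from the <stdint.h> already included above.
static void ExpandLaneIdxToByteIdx(const uint8_t row[8], uint8_t byte_idx[16]) {
  for (int i = 0; i < 8; ++i) {
    byte_idx[2 * i + 0] = row[i];                            // low byte: t
    byte_idx[2 * i + 1] = static_cast<uint8_t>(row[i] + 1);  // high byte: t + 1
  }
}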
6759
6760template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6761HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6762 HWY_DASSERT(mask_bits < 16);
6763
6764 // There are only 4 lanes, so we can afford to load the index vector directly.
6765 alignas(16) constexpr uint8_t u8_indices[256] = {
6766 // PrintCompress32x4Tables
6767 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6768 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6769 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
6770 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6771 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
6772 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
6773 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
6774 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
6775 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
6776 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
6777 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
6778 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
6779 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
6780 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
6781 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
6782 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6783
6784 const Repartition<uint8_t, decltype(d)> d8;
6785 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6786}
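// Worked example (annotation): for mask_bits = 5 (binary 0101, lanes 0 and 2
// active), row 5 of u8_indices above is {0,1,2,3, 8,9,10,11, 4,5,6,7,
// 12,13,14,15}, so the byte shuffle moves lane 2 directly after lane 0 and the
// compressed vector begins {v[0], v[2], ...}.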
6787
6788template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
6789HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6790 uint64_t mask_bits) {
6791 HWY_DASSERT(mask_bits < 16);
6792
6793 // There are only 4 lanes, so we can afford to load the index vector directly.
6794 alignas(16) constexpr uint8_t u8_indices[256] = {
6795 // PrintCompressNot32x4Tables
6796 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
6797 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
6798 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
6799 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
6800 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
6801 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
6802 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6803 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6804 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
6805 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
6806 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
6807 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
6808 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
6809 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
6810 12, 13, 14, 15};
6811
6812 const Repartition<uint8_t, decltype(d)> d8;
6813 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6814}
6815
6816template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6817HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
6818 HWY_DASSERT(mask_bits < 4);
6819
6820 // There are only 2 lanes, so we can afford to load the index vector directly.
6821 alignas(16) constexpr uint8_t u8_indices[64] = {
6822 // PrintCompress64x2Tables
6823 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6824 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6825 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6826 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6827
6828 const Repartition<uint8_t, decltype(d)> d8;
6829 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6830}
6831
6832template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
6833HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
6834 uint64_t mask_bits) {
6835 HWY_DASSERT(mask_bits < 4);
6836
6837 // There are only 2 lanes, so we can afford to load the index vector directly.
6838 alignas(16) constexpr uint8_t u8_indices[64] = {
6839 // PrintCompressNot64x2Tables
6840 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6841 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
6842 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
6843 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
6844
6845 const Repartition<uint8_t, decltype(d)> d8;
6846 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
6847}
6848
6849template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6850HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
6851 const Simd<T, N, 0> d;
6852 const RebindToUnsigned<decltype(d)> du;
6853
6854 HWY_DASSERT(mask_bits < (1ull << N));
6855 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6856 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6857}
6858
6859template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6860HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
6861 const Simd<T, N, 0> d;
6862 const RebindToUnsigned<decltype(d)> du;
6863
6864 HWY_DASSERT(mask_bits < (1ull << N));
6865 const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
6866 return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6867}
6868
6869} // namespace detail
6870
6871// Single lane: no-op
6872template <typename T>
6873HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6874 return v;
6875}
6876
6877// Two lanes: conditional swap
6878template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6879HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
6880 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
6881 const Full128<T> d;
6882 const Vec128<T> m = VecFromMask(d, mask);
6883 const Vec128<T> maskL = DupEven(m);
6884 const Vec128<T> maskH = DupOdd(m);
6885 const Vec128<T> swap = AndNot(maskL, maskH);
6886 return IfVecThenElse(swap, Shuffle01(v), v);
6887}
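// Worked example (annotation): for mask = {mask0, mask1} = {0, 1}, maskL =
// {mask0, mask0} is all-zero and maskH = {mask1, mask1} is all-ones, so
// swap = AndNot(maskL, maskH) is all-ones and Shuffle01 moves the single
// active (upper) lane to index 0. For {0, 0}, {1, 0} and {1, 1}, swap is
// all-zero and v is returned unchanged, which is already the compressed order.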
6888
6889// General case, 2 or 4 bytes
6890template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6891HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
6892 return detail::CompressBits(v, detail::BitsFromMask(mask));
6893}
6894
6895// Single lane: no-op
6896template <typename T>
6897HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
6898 return v;
6899}
6900
6901// Two lanes: conditional swap
6902template <typename T, HWY_IF_LANE_SIZE(T, 8)>
6903HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
6904 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
6905 const Full128<T> d;
6906 const Vec128<T> m = VecFromMask(d, mask);
6907 const Vec128<T> maskL = DupEven(m);
6908 const Vec128<T> maskH = DupOdd(m);
6909 const Vec128<T> swap = AndNot(maskH, maskL);
6910 return IfVecThenElse(swap, Shuffle01(v), v);
6911}
6912
6913// General case, 2 or 4 bytes
6914template <typename T, size_t N, HWY_IF_LANE_SIZE_ONE_OF(T, 0x14)>
6915HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
6916 // For partial vectors, we cannot pull the Not() into the table because
6917 // BitsFromMask clears the upper bits.
6918 if (N < 16 / sizeof(T)) {
6919 return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
6920 }
6921 return detail::CompressNotBits(v, detail::BitsFromMask(mask));
6922}
6923
6924// ------------------------------ CompressBlocksNot
6925HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
6926 Mask128<uint64_t> /* m */) {
6927 return v;
6928}
6929
6930template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6931HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
6932 const uint8_t* HWY_RESTRICT bits) {
6933 uint64_t mask_bits = 0;
6934 constexpr size_t kNumBytes = (N + 7) / 8;
6935 CopyBytes<kNumBytes>(bits, &mask_bits);
6936 if (N < 8) {
6937 mask_bits &= (1ull << N) - 1;
6938 }
6939
6940 return detail::CompressBits(v, mask_bits);
6941}
6942
6943// ------------------------------ CompressStore, CompressBitsStore
6944
6945template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6946HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
6947 T* HWY_RESTRICT unaligned) {
6948 const RebindToUnsigned<decltype(d)> du;
6949
6950 const uint64_t mask_bits = detail::BitsFromMask(m);
6951 HWY_DASSERT(mask_bits < (1ull << N));
6952 const size_t count = PopCount(mask_bits);
6953
6954 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6955 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6956 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6957 StoreU(compressed, d, unaligned);
6958 detail::MaybeUnpoison(unaligned, count);
6959 return count;
6960}
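// Usage sketch (annotation, not part of this file; StorePositive is a
// hypothetical helper): pack the strictly positive lanes of one full f32
// vector to the front of `out` and return how many lanes were written.
static inline size_t StorePositive(const float* HWY_RESTRICT in,
                                   float* HWY_RESTRICT out) {
  const Full128<float> d;
  const Vec128<float> v = LoadU(d, in);
  return CompressStore(v, v > Zero(d), d, out);  // out[0, count) is valid
}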
6961
6962template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6963HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
6964 Simd<T, N, 0> d,
6965 T* HWY_RESTRICT unaligned) {
6966 const RebindToUnsigned<decltype(d)> du;
6967
6968 const uint64_t mask_bits = detail::BitsFromMask(m);
6969 HWY_DASSERT(mask_bits < (1ull << N));
6970 const size_t count = PopCount(mask_bits);
6971
6972 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6973 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6974 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6975 BlendedStore(compressed, FirstN(d, count), d, unaligned);
6976 detail::MaybeUnpoison(unaligned, count);
6977 return count;
6978}
6979
6980template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
6981HWY_API size_t CompressBitsStore(Vec128<T, N> v,
6982 const uint8_t* HWY_RESTRICT bits,
6983 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
6984 const RebindToUnsigned<decltype(d)> du;
6985
6986 uint64_t mask_bits = 0;
6987 constexpr size_t kNumBytes = (N + 7) / 8;
6988 CopyBytes<kNumBytes>(bits, &mask_bits);
6989 if (N < 8) {
6990 mask_bits &= (1ull << N) - 1;
6991 }
6992 const size_t count = PopCount(mask_bits);
6993
6994 // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
6995 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
6996 const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
6997 StoreU(compressed, d, unaligned);
6998
6999 detail::MaybeUnpoison(unaligned, count);
7000 return count;
7001}
7002
7003#endif // HWY_TARGET <= HWY_AVX3
7004
7005// ------------------------------ StoreInterleaved2/3/4
7006
7007// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
7008// generic_ops-inl.h.
7009
7010// ------------------------------ Reductions
7011
7012namespace detail {
7013
7014// N=1 for any T: no-op
7015template <typename T>
7016HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
7017 const Vec128<T, 1> v) {
7018 return v;
7019}
7020template <typename T>
7021HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
7022 const Vec128<T, 1> v) {
7023 return v;
7024}
7025template <typename T>
7026HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
7027 const Vec128<T, 1> v) {
7028 return v;
7029}
7030
7031// u32/i32/f32:
7032
7033// N=2
7034template <typename T>
7035HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
7036 const Vec128<T, 2> v10) {
7037 return v10 + Shuffle2301(v10);
7038}
7039template <typename T>
7040HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
7041 const Vec128<T, 2> v10) {
7042 return Min(v10, Shuffle2301(v10));
7043}
7044template <typename T>
7045HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
7046 const Vec128<T, 2> v10) {
7047 return Max(v10, Shuffle2301(v10));
7048}
7049
7050// N=4 (full)
7051template <typename T>
7052HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
7053 const Vec128<T> v3210) {
7054 const Vec128<T> v1032 = Shuffle1032(v3210);
7055 const Vec128<T> v31_20_31_20 = v3210 + v1032;
7056 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
7057 return v20_31_20_31 + v31_20_31_20;
7058}
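// Worked trace (annotation): writing lanes as {lane0, lane1, lane2, lane3} =
// {a, b, c, d}, Shuffle1032 yields {c, d, a, b} and the first addition gives
// {a+c, b+d, a+c, b+d}; Shuffle0321 rotates that to {b+d, a+c, b+d, a+c}, so
// the final addition broadcasts a+b+c+d into every lane.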
7059template <typename T>
7060HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
7061 const Vec128<T> v3210) {
7062 const Vec128<T> v1032 = Shuffle1032(v3210);
7063 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
7064 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
7065 return Min(v20_31_20_31, v31_20_31_20);
7066}
7067template <typename T>
7068HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
7069 const Vec128<T> v3210) {
7070 const Vec128<T> v1032 = Shuffle1032(v3210);
7071 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
7072 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
7073 return Max(v20_31_20_31, v31_20_31_20);
7074}
7075
7076// u64/i64/f64:
7077
7078// N=2 (full)
7079template <typename T>
7080HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
7081 const Vec128<T> v10) {
7082 const Vec128<T> v01 = Shuffle01(v10);
7083 return v10 + v01;
7084}
7085template <typename T>
7086HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
7087 const Vec128<T> v10) {
7088 const Vec128<T> v01 = Shuffle01(v10);
7089 return Min(v10, v01);
7090}
7091template <typename T>
7092HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
7093 const Vec128<T> v10) {
7094 const Vec128<T> v01 = Shuffle01(v10);
7095 return Max(v10, v01);
7096}
7097
7098template <size_t N, HWY_IF_GE32(uint16_t, N)>
7099HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
7100 Vec128<uint16_t, N> v) {
7101 const Simd<uint16_t, N, 0> d;
7102 const RepartitionToWide<decltype(d)> d32;
7103 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
7104 const auto odd = ShiftRight<16>(BitCast(d32, v));
7105 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
7106 // Also broadcast into odd lanes.
7107 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
7108}
7109template <size_t N, HWY_IF_GE32(int16_t, N)>
7110HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
7111 Vec128<int16_t, N> v) {
7112 const Simd<int16_t, N, 0> d;
7113 const RepartitionToWide<decltype(d)> d32;
7114 // Sign-extend
7115 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
7116 const auto odd = ShiftRight<16>(BitCast(d32, v));
7117 const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
7118 // Also broadcast into odd lanes.
7119 return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
7120}
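// Scalar sketch (annotation, not part of this file) of the widening idea used
// above: each 32-bit word holds two u16 lanes, and (word & 0xFFFF) +
// (word >> 16) adds both halves in 32 bits; truncating the 32-bit total back
// to 16 bits equals the wrap-around sum of all u16 lanes.
static uint16_t SumU16LanesViaU32(const uint32_t* words, size_t num_words) {
  uint32_t sum = 0;
  for (size_t i = 0; i < num_words; ++i) {
    sum += (words[i] & 0xFFFFu) + (words[i] >> 16);  // even lane + odd lane
  }
  return static_cast<uint16_t>(sum);  // same as summing u16 lanes mod 2^16
}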
7121
7122// u8, N=8, N=16:
7123HWY_API Vec64<uint8_t> SumOfLanes(hwy::SizeTag<1>, const Vec64<uint8_t> v) {
7124 const Full64<uint8_t> d;
7125 return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
7126}
7127HWY_API Vec128<uint8_t> SumOfLanes(hwy::SizeTag<1> /* tag */,
7128 const Vec128<uint8_t> v) {
7129 const Full128<uint8_t> d;
7130 const Vec128<uint64_t> sums = SumOfLanes(hwy::SizeTag<8>(), SumsOf8(v));
7131 return Set(d, static_cast<uint8_t>(GetLane(sums) & 0xFF));
7132}
7133
7134template <size_t N, HWY_IF_GE64(int8_t, N)>
7135HWY_API Vec128<int8_t, N> SumOfLanes(hwy::SizeTag<1> /* tag */,
7136 const Vec128<int8_t, N> v) {
7137 const DFromV<decltype(v)> d;
7138 const RebindToUnsigned<decltype(d)> du;
7139 const auto is_neg = v < Zero(d);
7140
7141 // Sum positive and negative lanes separately, then combine to get the result.
7142 const auto positive = SumsOf8(BitCast(du, IfThenZeroElse(is_neg, v)));
7143 const auto negative = SumsOf8(BitCast(du, IfThenElseZero(is_neg, Abs(v))));
7144 return Set(d, static_cast<int8_t>(GetLane(
7145 SumOfLanes(hwy::SizeTag<8>(), positive - negative)) &
7146 0xFF));
7147}
7148
7149#if HWY_TARGET <= HWY_SSE4
7150HWY_API Vec128<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
7151 Vec128<uint16_t> v) {
7152 using V = decltype(v);
7153 return Broadcast<0>(V{_mm_minpos_epu16(v.raw)});
7154}
7162 const Half<DFromV<decltype(v)>> d;
7163 Vec64<uint8_t> result =
7164 Min(MinOfLanes(tag, UpperHalf(d, v)), MinOfLanes(tag, LowerHalf(d, v)));
7165 return Combine(DFromV<decltype(v)>(), result, result);
7166}
7167
7168HWY_API Vec128<uint16_t> MaxOfLanes(hwy::SizeTag<2> tag, Vec128<uint16_t> v) {
7169 const Vec128<uint16_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint16_t>()));
7170 return m - MinOfLanes(tag, m - v);
7171}
7172HWY_API Vec64<uint8_t> MaxOfLanes(hwy::SizeTag<1> tag, Vec64<uint8_t> v) {
7173 const Vec64<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>()));
7174 return m - MinOfLanes(tag, m - v);
7175}
7176HWY_API Vec128<uint8_t> MaxOfLanes(hwy::SizeTag<1> tag, Vec128<uint8_t> v) {
7177 const Vec128<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>()));
7178 return m - MinOfLanes(tag, m - v);
7179}
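// Identity used above (annotation): with m the all-ones vector, Max(v) equals
// m - Min(m - v). Example with 8-bit lanes: for v = {3, 200, 17}, m - v =
// {252, 55, 238}, whose minimum 55 gives 255 - 55 = 200, the maximum of v.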
7180#elif HWY_TARGET == HWY_SSSE3
7181template <size_t N, HWY_IF_GE64(uint8_t, N)>
7182HWY_API Vec128<uint8_t, N> MaxOfLanes(hwy::SizeTag<1> /* tag */,
7183 const Vec128<uint8_t, N> v) {
7184 const DFromV<decltype(v)> d;
7185 const RepartitionToWide<decltype(d)> d16;
7186 const RepartitionToWide<decltype(d16)> d32;
7187 Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
7188 vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
7189 vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
7190 if (N > 8) {
7191 const RepartitionToWide<decltype(d32)> d64;
7192 vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
7193 }
7194 return vm;
7195}
7196
7197template <size_t N, HWY_IF_GE64(uint8_t, N)>
7198HWY_API Vec128<uint8_t, N> MinOfLanes(hwy::SizeTag<1> /* tag */,
7199 const Vec128<uint8_t, N> v) {
7200 const DFromV<decltype(v)> d;
7201 const RepartitionToWide<decltype(d)> d16;
7202 const RepartitionToWide<decltype(d16)> d32;
7203 Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
7204 vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
7205 vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
7206 if (N > 8) {
7207 const RepartitionToWide<decltype(d32)> d64;
7208 vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
7209 }
7210 return vm;
7211}
7212#endif
7213
7214// Implement min/max of i8 in terms of u8 by toggling the sign bit.
7215template <size_t N, HWY_IF_GE64(int8_t, N)>
7216HWY_API Vec128<int8_t, N> MinOfLanes(hwy::SizeTag<1> tag,
7217 const Vec128<int8_t, N> v) {
7218 const DFromV<decltype(v)> d;
7219 const RebindToUnsigned<decltype(d)> du;
7220 const auto mask = SignBit(du);
7221 const auto vu = Xor(BitCast(du, v), mask);
7222 return BitCast(d, Xor(MinOfLanes(tag, vu), mask));
7223}
7224template <size_t N, HWY_IF_GE64(int8_t, N)>
7225HWY_API Vec128<int8_t, N> MaxOfLanes(hwy::SizeTag<1> tag,
7226 const Vec128<int8_t, N> v) {
7227 const DFromV<decltype(v)> d;
7228 const RebindToUnsigned<decltype(d)> du;
7229 const auto mask = SignBit(du);
7230 const auto vu = Xor(BitCast(du, v), mask);
7231 return BitCast(d, Xor(MaxOfLanes(tag, vu), mask));
7232}
7233
7234template <size_t N, HWY_IF_GE32(uint16_t, N)>
7235HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
7236 Vec128<uint16_t, N> v) {
7237 const Simd<uint16_t, N, 0> d;
7238 const RepartitionToWide<decltype(d)> d32;
7239 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
7240 const auto odd = ShiftRight<16>(BitCast(d32, v));
7241 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
7242 // Also broadcast into odd lanes.
7243 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
7244}
7245template <size_t N, HWY_IF_GE32(int16_t, N)>
7246HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
7247 Vec128<int16_t, N> v) {
7248 const Simd<int16_t, N, 0> d;
7249 const RepartitionToWide<decltype(d)> d32;
7250 // Sign-extend
7251 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
7252 const auto odd = ShiftRight<16>(BitCast(d32, v));
7253 const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
7254 // Also broadcast into odd lanes.
7255 return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
7256}
7257
7258template <size_t N, HWY_IF_GE32(uint16_t, N)>
7259HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
7260 Vec128<uint16_t, N> v) {
7261 const Simd<uint16_t, N, 0> d;
7262 const RepartitionToWide<decltype(d)> d32;
7263 const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
7264 const auto odd = ShiftRight<16>(BitCast(d32, v));
7265 const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
7266 // Also broadcast into odd lanes.
7267 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
7268}
7269template <size_t N, HWY_IF_GE32(int16_t, N)>
7270HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
7271 Vec128<int16_t, N> v) {
7272 const Simd<int16_t, N, 0> d;
7273 const RepartitionToWide<decltype(d)> d32;
7274 // Sign-extend
7275 const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
7276 const auto odd = ShiftRight<16>(BitCast(d32, v));
7277 const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
7278 // Also broadcast into odd lanes.
7279 return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
7280}
7281
7282} // namespace detail
7283
7284// Supported for u/i/f 32/64. Returns the same value in each lane.
7285template <typename T, size_t N>
7286HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7287 return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7288}
7289template <typename T, size_t N>
7290HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7291 return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7292}
7293template <typename T, size_t N>
7294HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
7295 return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
7296}
7297
7298// ------------------------------ Lt128
7299
7300namespace detail {
7301
7302// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
7303template <class D, class V = VFromD<D>>
7304HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
7305 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
7306 "D must be u64");
7307 // Truth table of Eq and Lt for Hi and Lo u64.
7308 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
7309 // =H =L cH cL | out = cH | (=H & cL)
7310 // 0 0 0 0 | 0
7311 // 0 0 0 1 | 0
7312 // 0 0 1 0 | 1
7313 // 0 0 1 1 | 1
7314 // 0 1 0 0 | 0
7315 // 0 1 0 1 | 0
7316 // 0 1 1 0 | 1
7317 // 1 0 0 0 | 0
7318 // 1 0 0 1 | 1
7319 // 1 1 0 0 | 0
7320 const auto eqHL = Eq(a, b);
7321 const V ltHL = VecFromMask(d, Lt(a, b));
7322 const V ltLX = ShiftLeftLanes<1>(ltHL);
7323 const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
7324 return InterleaveUpper(d, vecHx, vecHx);
7325}
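// Scalar reference (annotation, not part of this file) for the expression
// derived in the truth table above, out = cH | (=H & cL): a 128-bit unsigned
// value split into two u64 halves is less-than exactly when the high halves
// compare less, or the high halves are equal and the low halves compare less.
static bool Lt128Scalar(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo,
                        uint64_t b_hi) {
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}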
7326
7327// Returns vector-mask for Eq128. Also used by x86_256/x86_512.
7328template <class D, class V = VFromD<D>>
7329HWY_INLINE V Eq128Vec(const D d, const V a, const V b) {
7330 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
7331 "D must be u64");
7332 const auto eqHL = VecFromMask(d, Eq(a, b));
7333 const auto eqLH = Reverse2(d, eqHL);
7334 return And(eqHL, eqLH);
7335}
7336
7337template <class D, class V = VFromD<D>>
7338HWY_INLINE V Ne128Vec(const D d, const V a, const V b) {
7339 static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
7340 "D must be u64");
7341 const auto neHL = VecFromMask(d, Ne(a, b));
7342 const auto neLH = Reverse2(d, neHL);
7343 return Or(neHL, neLH);
7344}
7345
7346template <class D, class V = VFromD<D>>
7347HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
7348 // No specialization required for AVX-512: Mask <-> Vec is fast, and
7349 // copying mask bits to their neighbor seems infeasible.
7350 const V ltHL = VecFromMask(d, Lt(a, b));
7351 return InterleaveUpper(d, ltHL, ltHL);
7352}
7353
7354template <class D, class V = VFromD<D>>
7355HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) {
7356 // No specialization required for AVX-512: Mask <-> Vec is fast, and
7357 // copying mask bits to their neighbor seems infeasible.
7358 const V eqHL = VecFromMask(d, Eq(a, b));
7359 return InterleaveUpper(d, eqHL, eqHL);
7360}
7361
7362template <class D, class V = VFromD<D>>
7363HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) {
7364 // No specialization required for AVX-512: Mask <-> Vec is fast, and
7365 // copying mask bits to their neighbor seems infeasible.
7366 const V neHL = VecFromMask(d, Ne(a, b));
7367 return InterleaveUpper(d, neHL, neHL);
7368}
7369
7370} // namespace detail
7371
7372template <class D, class V = VFromD<D>>
7373HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
7374 return MaskFromVec(detail::Lt128Vec(d, a, b));
7375}
7376
7377template <class D, class V = VFromD<D>>
7378HWY_API MFromD<D> Eq128(D d, const V a, const V b) {
7379 return MaskFromVec(detail::Eq128Vec(d, a, b));
7380}
7381
7382template <class D, class V = VFromD<D>>
7383HWY_API MFromD<D> Ne128(D d, const V a, const V b) {
7384 return MaskFromVec(detail::Ne128Vec(d, a, b));
7385}
7386
7387template <class D, class V = VFromD<D>>
7388HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
7389 return MaskFromVec(detail::Lt128UpperVec(d, a, b));
7390}
7391
7392template <class D, class V = VFromD<D>>
7393HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) {
7394 return MaskFromVec(detail::Eq128UpperVec(d, a, b));
7395}
7396
7397template <class D, class V = VFromD<D>>
7398HWY_API MFromD<D> Ne128Upper(D d, const V a, const V b) {
7399 return MaskFromVec(detail::Ne128UpperVec(d, a, b));
7400}
7401
7402// ------------------------------ Min128, Max128 (Lt128)
7403
7404// Avoids the extra MaskFromVec in Lt128.
7405template <class D, class V = VFromD<D>>
7406HWY_API V Min128(D d, const V a, const V b) {
7407 return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
7408}
7409
7410template <class D, class V = VFromD<D>>
7411HWY_API V Max128(D d, const V a, const V b) {
7412 return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
7413}
7414
7415template <class D, class V = VFromD<D>>
7416HWY_API V Min128Upper(D d, const V a, const V b) {
7417 return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
7418}
7419
7420template <class D, class V = VFromD<D>>
7421HWY_API V Max128Upper(D d, const V a, const V b) {
7422 return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
7423}
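// Usage note (annotation): Min128/Max128 order each 128-bit block of u64 lanes
// as one unsigned 128-bit number with the upper lane most significant, whereas
// the *Upper variants compare only the upper lane, so the lower lane can carry
// a payload, e.g. a value travelling alongside its sort key.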
7424
7425// NOLINTNEXTLINE(google-readability-namespace-comments)
7426} // namespace HWY_NAMESPACE
7427} // namespace hwy
7428HWY_AFTER_NAMESPACE();
7429
7430// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
7431// the warning seems to be issued at the call site of intrinsics, i.e. our code.
7432HWY_DIAGNOSTICS(pop)
#define HWY_RESTRICT
Definition base.h:64
#define HWY_IF_LANE_SIZE(T, bytes)
Definition base.h:420
#define HWY_DIAGNOSTICS(tokens)
Definition base.h:78
#define HWY_IF_LE64(T, N)
Definition base.h:407
#define HWY_API
Definition base.h:129
#define HWY_IF_LE128(T, N)
Definition base.h:406
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_IS_MSAN
Definition base.h:202
#define HWY_INLINE
Definition base.h:70
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition base.h:79
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_MAYBE_UNUSED
Definition base.h:82
#define HWY_ASSERT(condition)
Definition base.h:192
Definition arm_neon-inl.h:825
Raw raw
Definition arm_neon-inl.h:835
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition arm_neon-inl.h:827
static Mask128< T, N > FromBits(uint64_t mask_bits)
Definition x86_128-inl.h:140
Definition arm_neon-inl.h:778
T PrivateT
Definition arm_neon-inl.h:782
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition x86_128-inl.h:82
typename detail::Raw128< T, N >::type Raw
Definition arm_neon-inl.h:779
Raw raw
Definition arm_neon-inl.h:814
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition x86_128-inl.h:88
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition x86_128-inl.h:97
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition x86_128-inl.h:94
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition x86_128-inl.h:79
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition x86_128-inl.h:91
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition x86_128-inl.h:85
#define HWY_AVX3_DL
Definition detect_targets.h:65
#define HWY_TARGET
Definition detect_targets.h:380
#define HWY_SSSE3
Definition detect_targets.h:70
HWY_API Vec128< T, N > Neg(hwy::NonFloatTag, Vec128< T, N > v)
Definition emu128-inl.h:726
HWY_API Vec128< T, N > Shuffle2301(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2413
HWY_INLINE void MaybeUnpoison(T *HWY_RESTRICT unaligned, size_t count)
Definition x86_128-inl.h:648
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition x86_128-inl.h:718
HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b)
Definition x86_128-inl.h:7355
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition arm_neon-inl.h:5447
HWY_INLINE Vec128< T, N > Abs(SignedTag, Vec128< T, N > a)
Definition emu128-inl.h:633
HWY_INLINE auto FixConversionOverflow(DI di, VFromD< DF > original, decltype(Zero(di).raw) converted_raw) -> VFromD< DI >
Definition x86_128-inl.h:5571
HWY_API Vec128< T, N > Shuffle3012(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2451
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N, 0 > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition x86_128-inl.h:5560
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition x86_128-inl.h:1570
HWY_INLINE Mask128< T, N > ExclusiveNeither(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:963
HWY_API void ScalarMaskedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition x86_128-inl.h:2205
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:535
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:815
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition x86_256-inl.h:2612
HWY_API Vec128< uint16_t, N > Shl(hwy::UnsignedTag, Vec128< uint16_t, N > v, Vec128< uint16_t, N > bits)
Definition x86_128-inl.h:5009
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition x86_128-inl.h:3286
HWY_INLINE T ExtractLane(const Vec128< T, N > v)
Definition wasm_128-inl.h:1688
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:3023
HWY_INLINE Vec128< T, N > InsertLane(const Vec128< T, N > v, T t)
Definition wasm_128-inl.h:1844
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition x86_128-inl.h:3275
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition wasm_128-inl.h:130
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:888
HWY_INLINE Vec128< T, N > Min(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:663
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5063
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:889
HWY_INLINE Vec128< T, 1 > SumOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5058
HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b)
Definition x86_128-inl.h:7363
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:545
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:852
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b)
Definition x86_128-inl.h:7347
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3418
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:861
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:3051
HWY_API Vec128< T, N > Shuffle1230(const Vec128< T, N > a, const Vec128< T, N > b)
Definition wasm_128-inl.h:2432
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition arm_neon-inl.h:2080
HWY_INLINE Vec128< T, N > Max(hwy::NonFloatTag, Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:671
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition x86_128-inl.h:670
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition x86_128-inl.h:760
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MaxU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition x86_128-inl.h:3147
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:2990
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:5068
HWY_INLINE Vec128< T > ZeroExtendVector(hwy::NonFloatTag, Full128< T >, Vec64< T > lo)
Definition x86_128-inl.h:4567
constexpr uint64_t OnlyActive(uint64_t bits)
Definition arm_neon-inl.h:5589
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b)
Definition arm_sve-inl.h:3038
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4235
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition arm_neon-inl.h:5364
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition x86_256-inl.h:2604
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition x86_128-inl.h:926
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition x86_128-inl.h:1406
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MinU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition x86_128-inl.h:3051
static bool SignBit(float f)
Definition scalar-inl.h:601
d
Definition rvv-inl.h:1998
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1631
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:619
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2190
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:4697
decltype(FirstN(D(), 0)) MFromD
Definition arm_sve-inl.h:276
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2445
HWY_API Vec128< T, N > operator-(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:576
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition arm_neon-inl.h:2230
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:4662
HWY_API Mask128< T, N > operator==(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1139
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:4272
HWY_INLINE Mask128< T, N > Ne128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6685
HWY_API bool AllTrue(const Full128< T > d, const Mask128< T > m)
Definition arm_neon-inl.h:5716
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition arm_neon-inl.h:4131
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition arm_neon-inl.h:1684
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition arm_neon-inl.h:4147
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:3436
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition arm_neon-inl.h:4448
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:3506
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5691
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition arm_neon-inl.h:3592
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3695
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition arm_neon-inl.h:5701
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1799
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2955
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2025
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_INLINE Mask128< T, N > Eq128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6668
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1949
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5334
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2207
HWY_API Vec128< To, 1 > TruncateTo(Simd< To, 1, 0 >, const Vec128< From, 1 > v)
Definition arm_neon-inl.h:4806
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2517
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4453
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition arm_neon-inl.h:2137
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:2555
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2217
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4517
HWY_INLINE Mask128< T, N > Ne128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6677
HWY_API Vec64< int64_t > Neg(const Vec64< int64_t > v)
Definition arm_neon-inl.h:1405
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:597
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5037
Vec128< T, 4/sizeof(T)> Vec32
Definition arm_neon-inl.h:821
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition arm_neon-inl.h:4912
HWY_INLINE Mask128< T, N > Eq128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6660
N ConcatEven(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4617
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition arm_neon-inl.h:4141
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:1931
HWY_API Mask128< T, N > IsInf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3511
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4544
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition arm_neon-inl.h:3540
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2055
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2060
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition arm_neon-inl.h:4181
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition arm_neon-inl.h:4872
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition arm_neon-inl.h:4719
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:214
HWY_API Vec128< T, 1 > CompressNot(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6198
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
typename D::Twice Twice
Definition ops/shared-inl.h:231
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API Mask128< T, N > operator<(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1163
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:6226
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition arm_neon-inl.h:4288
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2047
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2065
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition arm_neon-inl.h:5671
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition arm_neon-inl.h:2223
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:4646
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition arm_neon-inl.h:2253
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:2477
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2753
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition arm_neon-inl.h:4922
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4417
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:4019
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1998
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:3467
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:842
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1853
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition arm_neon-inl.h:2198
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
HWY_INLINE VFromD< D > Min128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6705
N ConcatOdd(Simd< T, N, 0 >, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4586
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:3453
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:223
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition arm_neon-inl.h:3973
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:4704
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3684
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6695
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4061
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:2326
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:4352
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4113
HWY_API Vec< D > SignBit(D d)
Definition generic_ops-inl.h:69
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5342
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:2314
typename V::PrivateT TFromV
Definition arm_neon-inl.h:845
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:6234
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:5407
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition arm_neon-inl.h:2277
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition arm_neon-inl.h:4135
HWY_INLINE VFromD< D > Max128Upper(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6710
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6623
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition arm_neon-inl.h:1761
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3145
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2040
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2266
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4570
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1462
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition arm_neon-inl.h:1642
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition arm_neon-inl.h:997
HWY_API bool AllFalse(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition arm_neon-inl.h:5710
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition arm_neon-inl.h:3739
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1085
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:881
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:4984
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition arm_neon-inl.h:1040
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:4281
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:386
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition arm_neon-inl.h:4456
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_API Vec128< float, N > RearrangeToOddPlusEven(const Vec128< float, N > sum0, const Vec128< float, N > sum1)
Definition arm_neon-inl.h:4412
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition x86_256-inl.h:4442
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition arm_neon-inl.h:1020
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:2449
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:1635
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition arm_neon-inl.h:4256
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition arm_neon-inl.h:5020
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition arm_neon-inl.h:2260
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1148
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:1986
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D)
Definition ops/shared-inl.h:271
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition arm_neon-inl.h:6700
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:3497
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition arm_neon-inl.h:1734
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition arm_neon-inl.h:3287
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition arm_neon-inl.h:4013
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1076
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition arm_neon-inl.h:5002
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition arm_neon-inl.h:1832
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition arm_neon-inl.h:2965
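PromoteTo widens each lane of the source vector to the lane type described by the first (tag) argument; the overload above widens eight uint8_t lanes to uint16_t. A minimal sketch, assuming the static-dispatch path; the function and pointer names are illustrative:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Loads 8 bytes and widens each to a 16-bit lane.
HWY_ATTR hn::Vec128<uint16_t> WidenBytes(const uint8_t* HWY_RESTRICT src) {
  const hn::Full64<uint8_t> d8;     // 8 x u8 in the lower half of a vector
  const hn::Full128<uint16_t> d16;  // 8 x u16
  return hn::PromoteTo(d16, hn::LoadU(d8, src));
}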
HWY_API Mask128< T, N > operator<=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:1180
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2033
decltype(Zero(D())) VFromD
Definition arm_neon-inl.h:1030
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition arm_neon-inl.h:2765
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:4678
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:1720
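MulFixedPoint15 multiplies signed Q15 fixed-point lanes, returning roughly (a*b + 16384) >> 15, i.e. the rounded product of two values in [-1, 1). A minimal sketch, assuming the static-dispatch path; the constants are illustrative Q15 encodings:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// 0.5 * 0.25 in Q15: 16384 * 8192 -> about 4096, which encodes 0.125.
HWY_ATTR hn::Vec128<int16_t> HalfTimesQuarterQ15() {
  const hn::Full128<int16_t> d;
  const auto half = hn::Set(d, int16_t{16384});    // 0.5 in Q15
  const auto quarter = hn::Set(d, int16_t{8192});  // 0.25 in Q15
  return hn::MulFixedPoint15(half, quarter);
}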
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition arm_neon-inl.h:4153
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:3425
typename D::Half Half
Definition ops/shared-inl.h:227
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition arm_neon-inl.h:5338
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition arm_neon-inl.h:3707
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6248
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition arm_neon-inl.h:2146
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition arm_neon-inl.h:3327
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition arm_neon-inl.h:1913
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6273
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition arm_neon-inl.h:1444
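RotateRight takes the rotation count as a template argument, so it must be a compile-time constant. A minimal sketch, assuming the static-dispatch path; the function name is illustrative:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Rotates each 32-bit lane right by 8 bits; bits shifted out re-enter at the top.
HWY_ATTR hn::Vec128<uint32_t> RotateLanesBy8(hn::Vec128<uint32_t> v) {
  return hn::RotateRight<8>(v);
}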
HWY_API Mask128< T, N > IsFinite(const Vec128< T, N > v)
Definition arm_neon-inl.h:3521
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:1964
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1361
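SumsOf8 adds each group of eight consecutive uint8_t lanes into one uint64_t lane, so a full 128-bit vector yields two partial sums. A minimal sketch, assuming the static-dispatch path and at least 16 readable bytes; the function name is illustrative:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Sums 16 bytes: SumsOf8 produces one u64 per 8-byte group; combine the two.
HWY_ATTR uint64_t SumBytes16(const uint8_t* HWY_RESTRICT bytes) {
  const hn::Full128<uint8_t> d8;
  const auto sums = hn::SumsOf8(hn::LoadU(d8, bytes));
  return hn::GetLane(sums) + hn::ExtractLane(sums, 1);
}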
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition arm_neon-inl.h:1885
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition arm_neon-inl.h:4712
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:6257
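CompressBlendedStore packs the lanes whose mask bit is set to the front and writes only that many elements, returning the count; CompressStore returns the same count but may also overwrite the following lanes of the destination. A minimal sketch of a filter loop, assuming the static-dispatch path, illustrative names, and n being a multiple of the lane count:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies only the strictly positive values of in[0, n) to out; returns the count.
HWY_ATTR size_t CopyPositive(const float* HWY_RESTRICT in, size_t n,
                             float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  size_t written = 0;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, in + i);
    written += hn::CompressBlendedStore(v, v > hn::Zero(d), d, out + written);
  }
  return written;
}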
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:4090
HWY_API size_t FindKnownFirstTrue(const Simd< T, N, 0 > d, const Mask128< T, N > mask)
Definition arm_neon-inl.h:5683
HWY_API Vec128< T, N > operator+(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:580
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition arm_neon-inl.h:4030
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition arm_neon-inl.h:1542
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2934
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1225
HWY_INLINE Mask128< T, N > Lt128Upper(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition arm_neon-inl.h:6651
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, const Vec128< T, N > b)
Definition emu128-inl.h:608
Vec128< T, 8/sizeof(T)> Vec64
Definition arm_neon-inl.h:818
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:376
long long int GatherIndex64
Definition x86_128-inl.h:3268
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition arm_neon-inl.h:3885
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition x86_256-inl.h:4429
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition arm_neon-inl.h:1773
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition arm_neon-inl.h:3713
typename D::T TFromD
Definition ops/shared-inl.h:203
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition arm_neon-inl.h:4977
HWY_API Vec128< T, 1 > Compress(Vec128< T, 1 > v, Mask128< T, 1 >)
Definition arm_neon-inl.h:6174
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition arm_neon-inl.h:1861
Definition aligned_allocator.h:27
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag, T t, size_t n)
Definition base.h:906
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition base.h:806
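Num0BitsBelowLS1Bit_Nonzero64 returns the number of trailing zero bits, i.e. the index of the lowest set bit; the argument must be nonzero. A minimal sketch that visits every set bit of a 64-bit mask (the helper and callback names are illustrative):

#include <stdint.h>

#include "hwy/base.h"

// Calls visit(i) for each set bit position i of bits, lowest first.
template <class Visit>
void ForEachSetBit(uint64_t bits, const Visit& visit) {
  while (bits != 0) {
    visit(hwy::Num0BitsBelowLS1Bit_Nonzero64(bits));  // index of lowest set bit
    bits &= bits - 1;                                 // clear that bit
  }
}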
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition base.h:924
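Mul128 is a scalar helper from base.h: a full 64x64-bit multiplication whose low 64 bits are returned and whose high 64 bits are written through the pointer. A minimal sketch; the wrapper name is illustrative:

#include <stdint.h>

#include "hwy/base.h"

// Splits the 128-bit product of a and b into low and high 64-bit halves.
inline void FullProduct64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
  *lo = hwy::Mul128(a, b, hi);
}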
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition base.h:607
HWY_API void CopySameSize(const From *HWY_RESTRICT from, To *HWY_RESTRICT to)
Definition base.h:961
typename EnableIfT< Condition >::type EnableIf
Definition base.h:383
HWY_API size_t PopCount(uint64_t x)
Definition base.h:865
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition base.h:796
#define HWY_ALIGN
Definition set_macros-inl.h:83
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
#define HWY_ATTR
Definition set_macros-inl.h:443
@ value
Definition arm_neon-inl.h:5730
Definition arm_neon-inl.h:3968
__m128i raw
Definition x86_128-inl.h:4131
detail::Raw128< T, N >::type raw
Definition arm_neon-inl.h:3969
Definition ops/shared-inl.h:52
HWY_INLINE __m128d operator()(__m128i v)
Definition x86_128-inl.h:187
HWY_INLINE __m128 operator()(__m128i v)
Definition x86_128-inl.h:183
HWY_INLINE __m128i operator()(__m128i v)
Definition x86_128-inl.h:179
__m128d type
Definition x86_128-inl.h:64
__f32x4 type
Definition wasm_128-inl.h:65
Definition x86_128-inl.h:55
__v128_u type
Definition wasm_128-inl.h:61
__mmask16 type
Definition x86_128-inl.h:119
__mmask8 type
Definition x86_128-inl.h:123
__mmask8 type
Definition x86_128-inl.h:127
__mmask8 type
Definition x86_128-inl.h:131
Definition x86_128-inl.h:116
Definition base.h:435
HWY_AFTER_NAMESPACE()
#define HWY_INLINE_F16
Definition x86_128-inl.h:5361
HWY_BEFORE_NAMESPACE()