// Extract from a larger SSE2 source; assumes <emmintrin.h>, <limits.h>,
// and the project's ui32/si32/ui64/si64 integer typedefs.

// OR-reduce the four 32-bit values at address, writing the result back.
// (Function head elided in the source; the name is an assumption.)
static void find_max_val32(ui32* address)
{
  __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
  x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2, 3, 2, 3]
  x0 = _mm_or_si128(x0, x1);        // lanes hold {0|2, 1|3, 2, 3}
  x1 = _mm_shuffle_epi32(x0, 0x55); // x1 = x0[1, 1, 1, 1]
  x0 = _mm_or_si128(x0, x1);        // lane 0 now ORs all four inputs
  _mm_storeu_si128((__m128i*)address, x0);
}
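/* For reference, a scalar equivalent of the reduction above (hypothetical
   helper, not in the original source): OR all four 32-bit lanes together. */
static inline ui32 scalar_or_reduce4(const ui32 v[4])
{
  return v[0] | v[1] | v[2] | v[3];
}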
// Reversible path: convert two's-complement samples to sign-magnitude,
// aligning magnitudes to the MSBs and accumulating their OR in max_val.
// (Function head elided in the source; the name and leading parameters
// are assumptions.)
static void rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
                         float delta_inv, ui32 count, ui32* max_val)
{
  (void)delta_inv; // unused in the reversible path
  ui32 shift = 31 - K_max;
  __m128i m0 = _mm_set1_epi32(INT_MIN); // 0x80000000: sign-bit mask
  __m128i zero = _mm_setzero_si128();
  __m128i one = _mm_set1_epi32(1);
  __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
  __m128i *p = (__m128i*)sp;
  for ( ; count >= 4; count -= 4, p += 1, dp += 4)
  {
    __m128i v = _mm_loadu_si128(p);
    __m128i sign = _mm_cmplt_epi32(v, zero); // all-ones where v < 0
    __m128i val = _mm_xor_si128(v, sign);    // one's complement of negatives
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi32(val, ones);          // +1 completes |v|
    sign = _mm_and_si128(sign, m0);          // keep only the sign bit
    val = _mm_slli_epi32(val, (int)shift);   // align magnitude to the MSBs
    tmax = _mm_or_si128(tmax, val);          // accumulate per-lane OR
    val = _mm_or_si128(val, sign);           // sign-magnitude sample
    _mm_storeu_si128((__m128i*)dp, val);
  }
  if (count) // 1 to 3 samples left; process a full, padded vector
  {
    __m128i v = _mm_loadu_si128(p);
    __m128i sign = _mm_cmplt_epi32(v, zero);
    __m128i val = _mm_xor_si128(v, sign);
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi32(val, ones);
    sign = _mm_and_si128(sign, m0);
    val = _mm_slli_epi32(val, (int)shift);
    // let only lanes with index < count contribute to max_val
    __m128i c = _mm_set1_epi32((si32)count);
    __m128i idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i mask = _mm_cmpgt_epi32(c, idx);
    c = _mm_and_si128(val, mask);
    tmax = _mm_or_si128(tmax, c);
    val = _mm_or_si128(val, sign);
    _mm_storeu_si128((__m128i*)dp, val);
  }
  _mm_storeu_si128((__m128i*)max_val, tmax); // per-lane maxima bits
}
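/* A scalar sketch of the per-sample forward transform above (hypothetical
   helper, not in the original source; arithmetic right shift assumed). */
static inline ui32 scalar_rev_to_sign_mag(si32 v, ui32 K_max)
{
  ui32 s = (ui32)(v >> 31);           // 0 or 0xFFFFFFFF
  ui32 mag = ((ui32)v ^ s) + (s & 1); // |v| without a branch
  return (mag << (31 - K_max)) | (s & 0x80000000u);
}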
// Irreversible path: quantize float samples by delta_inv, then convert
// to sign-magnitude, accumulating the OR of all magnitudes in max_val.
// (Function head elided in the source; the name and leading parameters
// are assumptions.)
static void irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
                         float delta_inv, ui32 count, ui32* max_val)
{
  (void)K_max; // magnitudes are not realigned in this path
  __m128 d = _mm_set1_ps(delta_inv);
  __m128i zero = _mm_setzero_si128();
  __m128i one = _mm_set1_epi32(1);
  __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
  float *p = (float*)sp;
  for ( ; count >= 4; count -= 4, p += 4, dp += 4)
  {
    __m128 vf = _mm_loadu_ps(p);
    vf = _mm_mul_ps(vf, d);             // quantize: sample * (1/delta)
    __m128i val = _mm_cvtps_epi32(vf);  // round per MXCSR (nearest-even)
    __m128i sign = _mm_cmplt_epi32(val, zero); // all-ones where val < 0
    val = _mm_xor_si128(val, sign);     // one's complement of negatives
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi32(val, ones);     // +1 completes |val|
    tmax = _mm_or_si128(tmax, val);     // accumulate per-lane OR
    sign = _mm_slli_epi32(sign, 31);    // reduce mask to the sign bit
    val = _mm_or_si128(val, sign);      // sign-magnitude sample
    _mm_storeu_si128((__m128i*)dp, val);
  }
  if (count) // 1 to 3 samples left; process a full, padded vector
  {
    __m128 vf = _mm_loadu_ps(p);
    vf = _mm_mul_ps(vf, d);
    __m128i val = _mm_cvtps_epi32(vf);
    __m128i sign = _mm_cmplt_epi32(val, zero);
    val = _mm_xor_si128(val, sign);
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi32(val, ones);
    // let only lanes with index < count contribute to max_val
    __m128i c = _mm_set1_epi32((si32)count);
    __m128i idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i mask = _mm_cmpgt_epi32(c, idx);
    c = _mm_and_si128(val, mask);
    tmax = _mm_or_si128(tmax, c);
    sign = _mm_slli_epi32(sign, 31);
    val = _mm_or_si128(val, sign);
    _mm_storeu_si128((__m128i*)dp, val);
  }
  _mm_storeu_si128((__m128i*)max_val, tmax); // per-lane maxima bits
}
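/* Usage note: _mm_cvtps_epi32 rounds according to MXCSR, which defaults
   to round-to-nearest-even. A scalar sketch of one quantization step
   under that default (hypothetical helper, not in the original source;
   assumes <math.h> for lrintf). */
static inline ui32 scalar_irv_quantize(float v, float delta_inv)
{
  si32 q = (si32)lrintf(v * delta_inv); // round to nearest, ties to even
  ui32 s = (ui32)(q >> 31);             // 0 or 0xFFFFFFFF
  ui32 mag = ((ui32)q ^ s) + (s & 1);   // |q|
  return mag | (s & 0x80000000u);       // sign-magnitude
}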
// Reversible inverse: sign-magnitude back to two's complement, undoing
// the MSB alignment. (Function head elided in the source; the name and
// leading parameters are assumptions.)
static void rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
                           float delta, ui32 count)
{
  (void)delta; // unused in the reversible path
  ui32 shift = 31 - K_max;
  __m128i m1 = _mm_set1_epi32(INT_MAX); // 0x7FFFFFFF: magnitude mask
  __m128i zero = _mm_setzero_si128();
  __m128i one = _mm_set1_epi32(1);
  si32 *p = (si32*)dp; // declaration elided in the source; type assumed
  for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
  {
    __m128i v = _mm_load_si128((__m128i*)sp);
    __m128i val = _mm_and_si128(v, m1);      // strip the sign bit
    val = _mm_srli_epi32(val, (int)shift);   // undo the MSB alignment
    __m128i sign = _mm_cmplt_epi32(v, zero); // all-ones where sign bit set
    val = _mm_xor_si128(val, sign);          // negate: one's complement...
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi32(val, ones);          // ...then +1
    _mm_storeu_si128((__m128i*)p, val);
  }
}
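/* A scalar sketch of the inverse mapping above (hypothetical helper,
   not in the original source). */
static inline si32 scalar_rev_from_sign_mag(ui32 v, ui32 K_max)
{
  si32 mag = (si32)((v & 0x7FFFFFFFu) >> (31 - K_max));
  return (v & 0x80000000u) ? -mag : mag;
}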
// Irreversible inverse: dequantize the magnitude by delta and transfer
// the stored sign bit directly into the IEEE-754 sign bit. (Function
// head elided in the source; the name and leading parameters are
// assumptions.)
static void irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
                           float delta, ui32 count)
{
  (void)K_max; // magnitudes are not realigned in this path
  __m128i m1 = _mm_set1_epi32(INT_MAX); // 0x7FFFFFFF: magnitude mask
  __m128 d = _mm_set1_ps(delta);
  float *p = (float*)dp;
  for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
  {
    __m128i v = _mm_load_si128((__m128i*)sp);
    __m128i vali = _mm_and_si128(v, m1);    // magnitude
    __m128 valf = _mm_cvtepi32_ps(vali);
    valf = _mm_mul_ps(valf, d);             // dequantize: magnitude * delta
    __m128i sign = _mm_andnot_si128(m1, v); // isolate the sign bit
    valf = _mm_or_ps(valf, _mm_castsi128_ps(sign)); // OR it into the float
    _mm_storeu_ps(p, valf);
  }
}
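/* A scalar sketch of the float reconstruction above (hypothetical helper,
   not in the original source; assumes <string.h> for memcpy). The sign is
   copied bit-for-bit into the IEEE-754 sign bit, mirroring _mm_or_ps. */
static inline float scalar_irv_dequantize(ui32 v, float delta)
{
  float f = (float)(v & 0x7FFFFFFFu) * delta; // dequantized magnitude
  ui32 bits;
  memcpy(&bits, &f, sizeof(bits));
  bits |= v & 0x80000000u;                    // transfer the sign bit
  memcpy(&f, &bits, sizeof(f));
  return f;
}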
// 64-bit reversible path: the same sign-magnitude conversion, two
// samples per vector; the 64-bit sign test is emulated with a 32-bit
// compare plus a shuffle. (Function head elided in the source; the name
// and leading parameters are assumptions.)
static void rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
                           float delta_inv, ui32 count, ui64* max_val)
{
  (void)delta_inv; // unused in the reversible path
  ui32 shift = 63 - K_max;
  __m128i m0 = _mm_set1_epi64x(LLONG_MIN); // sign-bit mask per qword
  __m128i zero = _mm_setzero_si128();
  __m128i one = _mm_set1_epi64x(1);
  __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
  __m128i *p = (__m128i*)sp;
  for ( ; count >= 2; count -= 2, p += 1, dp += 2)
  {
    __m128i v = _mm_loadu_si128(p);
    __m128i sign = _mm_cmplt_epi32(v, zero);
    sign = _mm_shuffle_epi32(sign, 0xF5); // 0xF5 = (3,3,1,1): replicate each
                                          // qword's high-dword compare
    __m128i val = _mm_xor_si128(v, sign); // one's complement of negatives
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi64(val, ones);       // +1 completes |v|
    sign = _mm_and_si128(sign, m0);       // keep only the sign bit
    val = _mm_slli_epi64(val, (int)shift);// align magnitude to the MSBs
    tmax = _mm_or_si128(tmax, val);       // accumulate per-lane OR
    val = _mm_or_si128(val, sign);        // sign-magnitude sample
    _mm_storeu_si128((__m128i*)dp, val);
  }
  if (count) // one sample left; process a full, padded vector
  {
    __m128i v = _mm_loadu_si128(p);
    __m128i sign = _mm_cmplt_epi32(v, zero);
    sign = _mm_shuffle_epi32(sign, 0xF5);
    __m128i val = _mm_xor_si128(v, sign);
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi64(val, ones);
    sign = _mm_and_si128(sign, m0);
    val = _mm_slli_epi64(val, (int)shift);
    // let only the low qword (the valid sample) contribute to max_val
    __m128i c = _mm_set_epi32(0, 0, (si32)0xFFFFFFFF, (si32)0xFFFFFFFF);
    c = _mm_and_si128(val, c);
    tmax = _mm_or_si128(tmax, c);
    val = _mm_or_si128(val, sign);
    _mm_storeu_si128((__m128i*)dp, val);
  }
  _mm_storeu_si128((__m128i*)max_val, tmax); // per-lane maxima bits
}
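/* SSE2 has no 64-bit signed compare; the loops above emulate "v < 0" per
   qword with a 32-bit compare and a shuffle. The same idiom in isolation
   (hypothetical helper, not in the original source). */
static inline __m128i sse2_sign_mask_epi64(__m128i v)
{
  __m128i s = _mm_cmplt_epi32(v, _mm_setzero_si128());
  return _mm_shuffle_epi32(s, 0xF5); // 0xF5 = (3,3,1,1): broadcast each
                                     // qword's high-dword result
}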
// 64-bit reversible inverse: sign-magnitude back to two's complement.
// (Function head elided in the source; the name and leading parameters
// are assumptions.)
static void rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
                             float delta, ui32 count)
{
  (void)delta; // unused in the reversible path
  ui32 shift = 63 - K_max;
  __m128i m1 = _mm_set1_epi64x(LLONG_MAX); // magnitude mask per qword
  __m128i zero = _mm_setzero_si128();
  __m128i one = _mm_set1_epi64x(1);
  si64 *p = (si64*)dp; // declaration elided in the source; type assumed
  for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
  {
    __m128i v = _mm_load_si128((__m128i*)sp);
    __m128i val = _mm_and_si128(v, m1);    // strip the sign bit
    val = _mm_srli_epi64(val, (int)shift); // undo the MSB alignment
    __m128i sign = _mm_cmplt_epi32(v, zero);
    sign = _mm_shuffle_epi32(sign, 0xF5);  // 64-bit sign mask per qword
    val = _mm_xor_si128(val, sign);        // negate: one's complement...
    __m128i ones = _mm_and_si128(sign, one);
    val = _mm_add_epi64(val, ones);        // ...then +1
    _mm_storeu_si128((__m128i*)p, val);
  }
}
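/* A scalar sketch of the 64-bit inverse above (hypothetical helper, not
   in the original source; si64/ui64 are assumed project typedefs). */
static inline si64 scalar_rev_from_sign_mag64(ui64 v, ui32 K_max)
{
  si64 mag = (si64)((v & 0x7FFFFFFFFFFFFFFFULL) >> (63 - K_max));
  return (v & 0x8000000000000000ULL) ? -mag : mag;
}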