// Fragment of a routine that clamps 32-bit samples read through sp to
// [0, (1 << bit_depth) - 1] and writes them to p, one byte per sample.
// NOTE(review): the function header, braces and some statements are not
// visible in this excerpt. sp is presumably a pointer to 32-bit integers and
// p a byte pointer (each pass loads 4 x 32 bytes while sp advances by 32, and
// stores 32 bytes while p advances by 32) -- confirm against the declaration.
// The packing below is only lossless if the clamped values fit in 8 bits
// (bit_depth <= 8); nothing visible here checks that.
65 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
66 __m256i zero = _mm256_setzero_si256();
// Byte shuffle applied independently to each 128-bit lane: output bytes are
// input bytes 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15, i.e. byte 0 of every
// 32-bit element first, then byte 1, byte 2 and byte 3.
67 __m256i mask = _mm256_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400,
68 0x0F0B07030E0A0602, 0x0D0905010C080400);
// Vector loop: 32 samples per pass.
73 for ( ; count >= 32; count -= 32, sp += 32, p += 32)
75 __m256i a, t, u, v0, v1;
// Samples 0..7: aligned load (the address must be 32-byte aligned), then
// clamp to [0, max].
76 a = _mm256_load_si256((__m256i*)sp);
77 a = _mm256_max_epi32(a, zero);
78 t = _mm256_min_epi32(a, max_val_vec);
// Samples 8..15: clamp, then move into bits 16..31 of the matching element
// of t.
80 a = _mm256_load_si256((__m256i*)sp + 1);
81 a = _mm256_max_epi32(a, zero);
82 a = _mm256_min_epi32(a, max_val_vec);
83 a = _mm256_slli_epi32(a, 16);
84 t = _mm256_or_si256(t, a);
// Samples 16..23 and 24..31 are combined the same way into u.
86 a = _mm256_load_si256((__m256i*)sp + 2);
87 a = _mm256_max_epi32(a, zero);
88 u = _mm256_min_epi32(a, max_val_vec);
90 a = _mm256_load_si256((__m256i*)sp + 3);
91 a = _mm256_max_epi32(a, zero);
92 a = _mm256_min_epi32(a, max_val_vec);
93 a = _mm256_slli_epi32(a, 16);
94 u = _mm256_or_si256(u, a);
// v0 = low lanes of t and u, v1 = high lanes of t and u. Shifting v1 left by
// 8 and OR-ing it in places four samples in the four bytes of every 32-bit
// element.
96 v0 = _mm256_permute2x128_si256(t, u, 0x20);
97 v1 = _mm256_permute2x128_si256(t, u, 0x31);
98 v1 = _mm256_slli_epi32(v1, 8);
99 v0 = _mm256_or_si256(v0, v1);
// Reorder the bytes of each lane into sample order and store 32 bytes
// (unaligned store, so p needs no particular alignment).
101 v0 = _mm256_shuffle_epi8(v0, mask);
102 _mm256_storeu_si256((__m256i*)p, v0);
// Scalar tail: handles whatever count the vector loop left over, one sample
// per pass, with the same clamp. The statements that read val and write the
// result are not visible in this excerpt.
105 int max_val = (1 << bit_depth) - 1;
106 for ( ; count > 0; --count)
109 val = val >= 0 ? val : 0;
110 val = val <= max_val ? val : max_val;
// Fragment of a routine that clamps 32-bit samples from three source lines
// (ln0, ln1, ln2) to [0, (1 << bit_depth) - 1] and interleaves them into p as
// byte triplets (ln0 sample, ln1 sample, ln2 sample).
// NOTE(review): the function header, braces and some statements are not
// visible in this excerpt. p is presumably a byte pointer (it advances by 96
// per 32 samples x 3 components) -- confirm against the declaration. The
// packing is only lossless if the clamped values fit in 8 bits
// (bit_depth <= 8); nothing visible here checks that.
120 int max_val = (1 << bit_depth) - 1;
121 __m256i max_val_vec = _mm256_set1_epi32(max_val);
122 __m256i zero = _mm256_setzero_si256();
// Byte shuffle applied independently to each 128-bit lane: keeps bytes 0..2
// of each of the four 32-bit elements (12 bytes, packed at the bottom of the
// lane) and zeroes the top 4 bytes (index 0xFF selects zero).
123 __m256i m0 = _mm256_set_epi64x((
si64)0xFFFFFFFF0E0D0C0A,
124 (
si64)0x0908060504020100,
125 (
si64)0xFFFFFFFF0E0D0C0A,
126 (
si64)0x0908060504020100);
// The three sources are read as 256-bit vectors with aligned loads below, so
// each i32 buffer must be 32-byte aligned.
129 const __m256i* sp0 = (__m256i*)ln0->
i32;
130 const __m256i* sp1 = (__m256i*)ln1->
i32;
131 const __m256i* sp2 = (__m256i*)ln2->
i32;
// Vector loop: 32 samples per component per pass (4 vectors from each
// source), producing 96 output bytes.
133 for ( ; count >= 32; count -= 32, sp0 += 4, sp1 += 4, sp2 += 4, p += 96)
135 __m256i a, t, u, v, w;
// Samples 0..7: the clamped ln0 value goes in byte 0 of each 32-bit element,
// ln1 in byte 1 (shift by 8) and ln2 in byte 2 (shift by 16); the shuffle
// then drops byte 3 of every element.
137 a = _mm256_load_si256(sp0);
138 a = _mm256_max_epi32(a, zero);
139 t = _mm256_min_epi32(a, max_val_vec);
141 a = _mm256_load_si256(sp1);
142 a = _mm256_max_epi32(a, zero);
143 a = _mm256_min_epi32(a, max_val_vec);
144 a = _mm256_slli_epi32(a, 8);
145 t = _mm256_or_si256(t, a);
147 a = _mm256_load_si256(sp2);
148 a = _mm256_max_epi32(a, zero);
149 a = _mm256_min_epi32(a, max_val_vec);
150 a = _mm256_slli_epi32(a, 16);
151 t = _mm256_or_si256(t, a);
152 t = _mm256_shuffle_epi8(t, m0);
// Samples 8..15, same sequence, result in u.
155 a = _mm256_load_si256(sp0 + 1);
156 a = _mm256_max_epi32(a, zero);
157 u = _mm256_min_epi32(a, max_val_vec);
159 a = _mm256_load_si256(sp1 + 1);
160 a = _mm256_max_epi32(a, zero);
161 a = _mm256_min_epi32(a, max_val_vec);
162 a = _mm256_slli_epi32(a, 8);
163 u = _mm256_or_si256(u, a);
165 a = _mm256_load_si256(sp2 + 1);
166 a = _mm256_max_epi32(a, zero);
167 a = _mm256_min_epi32(a, max_val_vec);
168 a = _mm256_slli_epi32(a, 16);
169 u = _mm256_or_si256(u, a);
170 u = _mm256_shuffle_epi8(u, m0);
// Samples 16..23, same sequence, result in v.
173 a = _mm256_load_si256(sp0 + 2);
174 a = _mm256_max_epi32(a, zero);
175 v = _mm256_min_epi32(a, max_val_vec);
177 a = _mm256_load_si256(sp1 + 2);
178 a = _mm256_max_epi32(a, zero);
179 a = _mm256_min_epi32(a, max_val_vec);
180 a = _mm256_slli_epi32(a, 8);
181 v = _mm256_or_si256(v, a);
183 a = _mm256_load_si256(sp2 + 2);
184 a = _mm256_max_epi32(a, zero);
185 a = _mm256_min_epi32(a, max_val_vec);
186 a = _mm256_slli_epi32(a, 16);
187 v = _mm256_or_si256(v, a);
188 v = _mm256_shuffle_epi8(v, m0);
// Samples 24..31, same sequence, result in w.
191 a = _mm256_load_si256(sp0 + 3);
192 a = _mm256_max_epi32(a, zero);
193 w = _mm256_min_epi32(a, max_val_vec);
195 a = _mm256_load_si256(sp1 + 3);
196 a = _mm256_max_epi32(a, zero);
197 a = _mm256_min_epi32(a, max_val_vec);
198 a = _mm256_slli_epi32(a, 8);
199 w = _mm256_or_si256(w, a);
201 a = _mm256_load_si256(sp2 + 3);
202 a = _mm256_max_epi32(a, zero);
203 a = _mm256_min_epi32(a, max_val_vec);
204 a = _mm256_slli_epi32(a, 16);
205 w = _mm256_or_si256(w, a);
206 w = _mm256_shuffle_epi8(w, m0);
// Each 128-bit lane now holds 12 payload bytes followed by 4 zero bytes.
// The 16-byte unaligned stores are issued 12 bytes apart in increasing
// address order, so the 4 padding bytes written by one store are overwritten
// by the next one. Statement order matters here.
208 _mm_storeu_si128((__m128i*)(p ), _mm256_castsi256_si128(t));
209 _mm_storeu_si128((__m128i*)(p + 12), _mm256_extracti128_si256(t,1));
210 _mm_storeu_si128((__m128i*)(p + 24), _mm256_castsi256_si128(u));
211 _mm_storeu_si128((__m128i*)(p + 36), _mm256_extracti128_si256(u,1));
212 _mm_storeu_si128((__m128i*)(p + 48), _mm256_castsi256_si128(v));
213 _mm_storeu_si128((__m128i*)(p + 60), _mm256_extracti128_si256(v,1));
214 _mm_storeu_si128((__m128i*)(p + 72), _mm256_castsi256_si128(w));
// The upper lane of w is the last 12 bytes of this pass; it is written as
// exactly 12 bytes (8 + 4 on x86-64, 4 + 4 + 4 on i386) so that nothing is
// stored at or beyond p + 96.
// NOTE(review): the #else / #endif lines are not visible in this excerpt;
// the p + 92 store presumably follows the #endif and is shared by both
// branches -- confirm. These stores go through casted si64*/si32* pointers
// at offsets that are not multiples of the element size relative to p, so
// they rely on the target tolerating unaligned, type-punned stores;
// consider memcpy.
215#ifdef OJPH_ARCH_X86_64
216 *((
si64*)(p + 84)) = _mm256_extract_epi64(w, 2);
217#elif (defined OJPH_ARCH_I386)
218 *((
si32*)(p + 84)) = _mm256_extract_epi32(w, 4);
219 *((
si32*)(p + 88)) = _mm256_extract_epi32(w, 5);
221 #error Error unsupport compiler
223 *((
si32*)(p + 92)) = _mm256_extract_epi32(w, 6);
// Scalar tail: handles whatever count the vector loop left over. Each pass
// applies the same [0, max_val] clamp three times, presumably once per
// component; the statements that read val and write the result are not
// visible in this excerpt.
249 for ( ; count > 0; --count)
253 val = val >= 0 ? val : 0;
254 val = val <= max_val ? val : max_val;
257 val = val >= 0 ? val : 0;
258 val = val <= max_val ? val : max_val;
261 val = val >= 0 ? val : 0;
262 val = val <= max_val ? val : max_val;
// Fragment of a routine that clamps 32-bit samples read through sp to
// [0, (1 << bit_depth) - 1] and writes them to p as 16-bit values, keeping
// the low byte of each value first (the byte order of the register, i.e.
// little-endian on x86).
// NOTE(review): the function header, braces and some statements are not
// visible in this excerpt. sp is presumably a pointer to 32-bit integers and
// p a pointer to 16-bit elements (each pass stores 32 bytes while p advances
// by 16) -- confirm against the declaration. The packing is only lossless if
// the clamped values fit in 16 bits (bit_depth <= 16); nothing visible here
// checks that.
275 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
276 __m256i zero = _mm256_setzero_si256();
// Byte shuffle applied independently to each 128-bit lane: the low 16-bit
// halves of the four 32-bit elements come first (bytes 0,1, 4,5, 8,9, 12,13),
// followed by the four high halves (bytes 2,3, 6,7, 10,11, 14,15).
277 __m256i mask = _mm256_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100,
278 0x0F0E0B0A07060302, 0x0D0C090805040100);
// Vector loop: 16 samples per pass.
283 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
// Samples 0..7: aligned load (the address must be 32-byte aligned), then
// clamp to [0, max].
286 a = _mm256_load_si256((__m256i*)sp);
287 a = _mm256_max_epi32(a, zero);
288 t = _mm256_min_epi32(a, max_val_vec);
// Samples 8..15: clamp, then move into bits 16..31 of the matching element
// of t.
290 a = _mm256_load_si256((__m256i*)sp + 1);
291 a = _mm256_max_epi32(a, zero);
292 a = _mm256_min_epi32(a, max_val_vec);
293 a = _mm256_slli_epi32(a, 16);
294 t = _mm256_or_si256(t, a);
// After the shuffle the four 64-bit quarters hold samples 0..3, 8..11, 4..7
// and 12..15; permute control 0xD8 selects quarters 0,2,1,3, which restores
// sample order before the unaligned 32-byte store.
296 t = _mm256_shuffle_epi8(t, mask);
297 t = _mm256_permute4x64_epi64(t, 0xD8);
298 _mm256_storeu_si256((__m256i*)p, t);
// Scalar tail: handles whatever count the vector loop left over, one sample
// per pass, with the same clamp. The statements that read val and write the
// result are not visible in this excerpt.
301 int max_val = (1<<bit_depth) - 1;
302 for ( ; count > 0; --count)
305 val = val >= 0 ? val : 0;
306 val = val <= max_val ? val : max_val;
// Fragment of a routine that clamps 32-bit samples read through sp to
// [0, (1 << bit_depth) - 1] and writes them to p as 16-bit values with the
// two bytes of each value swapped, i.e. most significant byte first
// (big-endian order in memory).
// NOTE(review): the function header, braces and some statements are not
// visible in this excerpt, and the fragment may continue past its last
// visible line. sp is presumably a pointer to 32-bit integers and p a
// pointer to 16-bit elements (each pass stores 32 bytes while p advances by
// 16) -- confirm against the declaration. The packing is only lossless if
// the clamped values fit in 16 bits (bit_depth <= 16); nothing visible here
// checks that.
319 __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
320 __m256i zero = _mm256_setzero_si256();
// Byte shuffle applied independently to each 128-bit lane: the low 16-bit
// halves of the four 32-bit elements come first, then the four high halves,
// and within every 16-bit value the two bytes are exchanged
// (bytes 1,0, 5,4, 9,8, 13,12, then 3,2, 7,6, 11,10, 15,14).
321 __m256i mask = _mm256_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001,
322 0x0E0F0A0B06070203, 0x0C0D080904050001);
// Vector loop: 16 samples per pass.
327 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
// Samples 0..7: aligned load (the address must be 32-byte aligned), then
// clamp to [0, max].
330 a = _mm256_load_si256((__m256i*)sp);
331 a = _mm256_max_epi32(a, zero);
332 t = _mm256_min_epi32(a, max_val_vec);
// Samples 8..15: clamp, then move into bits 16..31 of the matching element
// of t.
334 a = _mm256_load_si256((__m256i*)sp + 1);
335 a = _mm256_max_epi32(a, zero);
336 a = _mm256_min_epi32(a, max_val_vec);
337 a = _mm256_slli_epi32(a, 16);
338 t = _mm256_or_si256(t, a);
// After the shuffle the four 64-bit quarters hold samples 0..3, 8..11, 4..7
// and 12..15; permute control 0xD8 selects quarters 0,2,1,3, which restores
// sample order before the unaligned 32-byte store.
340 t = _mm256_shuffle_epi8(t, mask);
341 t = _mm256_permute4x64_epi64(t, 0xD8);
342 _mm256_storeu_si256((__m256i*)p, t);
// Scalar tail: handles whatever count the vector loop left over, one sample
// per pass, with the same clamp. The statements that read val and write the
// result are not visible in this excerpt.
345 int max_val = (1<<bit_depth) - 1;
346 for ( ; count > 0; --count)
349 val = val >= 0 ? val : 0;
350 val = val <= max_val ? val : max_val;