52 float *y,
float *cb,
float *cr,
ui32 repeat)
59 for (
int i = (repeat + 7) >> 3; i > 0; --i)
61 __m256 mr = _mm256_load_ps(r);
62 __m256 mb = _mm256_load_ps(b);
63 __m256 my = _mm256_mul_ps(alpha_rf, mr);
64 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_gf, _mm256_load_ps(g)));
65 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_bf, mb));
66 _mm256_store_ps(y, my);
67 _mm256_store_ps(cb, _mm256_mul_ps(beta_cbf, _mm256_sub_ps(mb, my)));
68 _mm256_store_ps(cr, _mm256_mul_ps(beta_crf, _mm256_sub_ps(mr, my)));
70 r += 8; g += 8; b += 8;
71 y += 8; cb += 8; cr += 8;
77 float *r,
float *g,
float *b,
ui32 repeat)
83 for (
int i = (repeat + 7) >> 3; i > 0; --i)
85 __m256 my = _mm256_load_ps(y);
86 __m256 mcr = _mm256_load_ps(cr);
87 __m256 mcb = _mm256_load_ps(cb);
88 __m256 mg = _mm256_sub_ps(my, _mm256_mul_ps(gamma_cr2g, mcr));
89 _mm256_store_ps(g, _mm256_sub_ps(mg, _mm256_mul_ps(gamma_cb2g, mcb)));
90 _mm256_store_ps(r, _mm256_add_ps(my, _mm256_mul_ps(gamma_cr2r, mcr)));
91 _mm256_store_ps(b, _mm256_add_ps(my, _mm256_mul_ps(gamma_cb2b, mcb)));
93 y += 8; cb += 8; cr += 8;
94 r += 8; g += 8; b += 8;
/*
 * avx_ict_forward
 *
 * Signature only: three read-only float arrays (r, g, b), three writable
 * float arrays (y, cb, cr) and an unsigned count `repeat`.
 *
 * NOTE(review): the body is not visible at this point. The loop fragment
 * earlier in this listing that uses the same parameter names computes,
 * eight floats at a time:
 *     y  = alpha_rf * r + alpha_gf * g + alpha_bf * b
 *     cb = beta_cbf * (b - y)
 *     cr = beta_crf * (r - y)
 * and runs (repeat + 7) >> 3 iterations using _mm256_load_ps /
 * _mm256_store_ps. Presumably that is this function's body -- confirm
 * against the full source. If so, every buffer must be 32-byte aligned and
 * readable/writable up to the next multiple of 8 floats past `repeat`,
 * because the last iteration always touches a full 8-float vector.
 */
void avx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
/*
 * avx_ict_backward
 *
 * Signature only: three read-only float arrays (y, cb, cr), three writable
 * float arrays (r, g, b) and an unsigned count `repeat`.
 *
 * NOTE(review): the body is not visible at this point. The loop fragment
 * earlier in this listing that uses the same parameter names computes,
 * eight floats at a time:
 *     g = y - gamma_cr2g * cr - gamma_cb2g * cb
 *     r = y + gamma_cr2r * cr
 *     b = y + gamma_cb2b * cb
 * and runs (repeat + 7) >> 3 iterations using _mm256_load_ps /
 * _mm256_store_ps. Presumably that is this function's body -- confirm
 * against the full source. If so, every buffer must be 32-byte aligned and
 * readable/writable up to the next multiple of 8 floats past `repeat`,
 * because the last iteration always touches a full 8-float vector.
 */
void avx_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)