struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
vlc_src_table tbl0[] = {
  // ... (table entries not shown) ...
};
size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);

// popcount of every 4-bit pattern; used to rank candidate e_k patterns below
si32 pattern_popcnt[16];
for (ui32 i = 0; i < 16; ++i)
  pattern_popcnt[i] = (si32)population_count(i);

// Expand the source table into a 2048-entry lookup table indexed by
// (c_q << 8) | (rho << 4) | emb.
vlc_src_table* src_tbl = tbl0;
ui16* tgt_tbl = vlc_tbl0;                 // destination table (name assumed)
size_t tbl_size = tbl0_size;
for (int i = 0; i < 2048; ++i)
{
  int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
  if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
    continue;   // impossible combination of significance and EMB patterns

  // Prefer a u_off == 1 entry whose e_k/e_1 patterns are consistent with emb;
  // among the matches keep the one with the most bits set in e_k.
  vlc_src_table *best_entry = NULL;
  int best_e_k = -1;
  for (size_t j = 0; j < tbl_size; ++j)
  {
    if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
      if (src_tbl[j].u_off == 1)
        if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
        {
          int ones_count = pattern_popcnt[src_tbl[j].e_k];
          if (ones_count >= best_e_k)
          {
            best_entry = src_tbl + j;
            best_e_k = ones_count;
          }
        }
  }

  // If no u_off == 1 entry matched, fall back to the u_off == 0 entry for
  // this (c_q, rho).
  if (best_entry == NULL)
    for (size_t j = 0; j < tbl_size; ++j)
    {
      if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
        if (src_tbl[j].u_off == 0)
        {
          best_entry = src_tbl + j;
          break;
        }
    }

  // pack the codeword and its length; the low bits of the entry are not shown
  tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
                      /* + ... */);
}
// Same construction for the second table, used for non-initial quad rows.
vlc_src_table tbl1[] = {
  // ... (table entries not shown) ...
};
size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);

src_tbl = tbl1;
tgt_tbl = vlc_tbl1;                       // destination table (name assumed)
tbl_size = tbl1_size;
for (int i = 0; i < 2048; ++i)
{
  int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
  if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
    continue;

  vlc_src_table *best_entry = NULL;
  int best_e_k = -1;
  for (size_t j = 0; j < tbl_size; ++j)
  {
    if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
      if (src_tbl[j].u_off == 1)
        if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
        {
          int ones_count = pattern_popcnt[src_tbl[j].e_k];
          if (ones_count >= best_e_k)
          {
            best_entry = src_tbl + j;
            best_e_k = ones_count;
          }
        }
  }

  if (best_entry == NULL)
    for (size_t j = 0; j < tbl_size; ++j)
    {
      if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
        if (src_tbl[j].u_off == 0)
        {
          best_entry = src_tbl + j;
          break;
        }
    }

  tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
                      /* + ... */);
}
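// proc_pixel: from four vectors of samples (two rows of 16 columns, i.e.
// eight 2x2 quads), derive per-sample magnitude exponents (eq_vec), the
// words emitted on the MagSgn stream (s_vec), the per-quad significance
// pattern rho_vec, and the per-quad maximum exponent e_qmax_vec.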
static void proc_pixel(const __m256i *src_vec, ui32 p,
                       __m256i *eq_vec, __m256i *s_vec,
                       __m256i &rho_vec, __m256i &e_qmax_vec)
{
  __m256i val_vec[4], _eq_vec[4], _s_vec[4], _rho_vec[4];

  for (ui32 i = 0; i < 4; ++i) {
    // drop the sign bit and the bitplanes that are not coded
    val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
    val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
    val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
    // ...
    val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
    _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
    // ...
    val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
    _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);       // sign bit
    _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
    // ...
    // keep exponent and MagSgn word only for significant samples
    _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
    _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
    val_vec[i] = _mm256_srli_epi32(val_notmask, 31);     // significance bit
  }

  // De-interleave: turn two rows of 16 columns into four vectors, one per
  // in-quad sample position, each holding one sample from each of the
  // eight quads.
  const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
  __m256i tmp1, tmp2;
  for (ui32 i = 0; i < 2; ++i) {
    tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
    tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
    eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
    eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));

    tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
    tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
    s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
    s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));

    tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
    tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
    _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
    _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
  }

  // per-quad maximum exponent and 4-bit significance pattern rho
  e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
  e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
  e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
  _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
  _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
  _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
  rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
  rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
  rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
}
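// Helper that regroups four row vectors of 32-bit values so that the four
// values of each quad end up in adjacent lanes (two quads per vector):
// 32-bit and 64-bit unpacks followed by a 128-bit lane fix-up.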
  __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
  __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
  __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
  __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);

  matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2);
  matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4);
  matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2);
  matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4);

  tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20);
  matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31);
  matrix[0] = tmp1;

  tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20);
  matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31);
  matrix[1] = tmp1;
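// Number of MagSgn bits for each in-quad sample position n:
// m_n = U_q minus the corresponding low bit of the VLC tuple, forced to
// zero for insignificant samples (bit n of rho clear).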
  auto tmp = _mm256_and_si256(tuple_vec, ONE);
  tmp = _mm256_sub_epi32(uq_vec, tmp);
  auto tmp1 = _mm256_and_si256(rho_vec, ONE);
  auto mask = _mm256_cmpeq_epi32(tmp1, ONE);   // all-ones where sample 0 is
                                               // significant (line assumed)
  m_vec[0] = _mm256_and_si256(mask, tmp);

  tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
  tmp = _mm256_srli_epi32(tmp, 1);
  tmp = _mm256_sub_epi32(uq_vec, tmp);
  tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
  mask = _mm256_cmpeq_epi32(tmp1, _mm256_set1_epi32(2));
  m_vec[1] = _mm256_and_si256(mask, tmp);

  tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
  tmp = _mm256_srli_epi32(tmp, 2);
  tmp = _mm256_sub_epi32(uq_vec, tmp);
  tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
  mask = _mm256_cmpeq_epi32(tmp1, _mm256_set1_epi32(4));
  m_vec[2] = _mm256_and_si256(mask, tmp);

  tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
  tmp = _mm256_srli_epi32(tmp, 3);
  tmp = _mm256_sub_epi32(uq_vec, tmp);
  tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
  mask = _mm256_cmpeq_epi32(tmp1, _mm256_set1_epi32(8));
  m_vec[3] = _mm256_and_si256(mask, tmp);
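// Spill the per-sample bit counts and masked MagSgn words to small arrays,
// then concatenate pairs of codewords into a single 64-bit word for emission
// on the MagSgn stream.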
  for (ui32 i = 0; i < 4; ++i) {
    _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
    tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
    tmp = _mm256_sub_epi32(tmp, ONE);
    tmp = _mm256_and_si256(tmp, s_vec[i]);
    _mm256_storeu_si256((__m256i*)cwd, tmp);

    for (ui32 j = 0; j < 4; ++j) {
      _cwd_len = cwd_len[idx];
      _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
      _cwd_len += cwd_len[idx + 1];
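// cal_eps_vec: build the per-quad 4-bit pattern whose bit n is set when the
// exponent of sample n equals the quad maximum, and keep it only for quads
// with u_q > 0.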
static __m256i cal_eps_vec(const __m256i *eq_vec, __m256i u_q_vec,
                           __m256i e_qmax_vec)
{
  auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);

  auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
  auto eps_vec = _mm256_srli_epi32(mask, 31);

  mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
  auto tmp = _mm256_srli_epi32(mask, 31);
  tmp = _mm256_slli_epi32(tmp, 1);
  eps_vec = _mm256_or_si256(eps_vec, tmp);

  mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
  tmp = _mm256_srli_epi32(mask, 31);
  tmp = _mm256_slli_epi32(tmp, 2);
  eps_vec = _mm256_or_si256(eps_vec, tmp);

  mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
  tmp = _mm256_srli_epi32(mask, 31);
  tmp = _mm256_slli_epi32(tmp, 3);
  eps_vec = _mm256_or_si256(eps_vec, tmp);

  return _mm256_and_si256(u_q_mask, eps_vec);
}
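// update_lcxp: store the line context consumed by the next row pair; each
// entry combines significance bits from the bottom row of the current quad
// and of its left neighbour, with the last lane carried over to the next
// vector through prev_cx_val_vec.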
static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
                        __m256i &rho_vec, __m256i *cx_val_vec,
                        const __m256i left_shift)
{
  // rotate rho left by one lane and splice in the value carried over from
  // the previous vector; remember this vector's last lane for the next one
  auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
  tmp = _mm256_insert_epi32(tmp,
    _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
  prev_cx_val_vec = _mm256_insert_epi32(ZERO,
    _mm256_extract_epi32(rho_vec, 7), 0);

  tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));   // bit 3 of the left quad
  tmp = _mm256_srli_epi32(tmp, 3);

  auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); // bit 1 of this quad
  tmp1 = _mm256_srli_epi32(tmp1, 1);
  cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
}
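// proc_cq: form each quad's context from the stored line state of the row
// above (two neighbouring entries, one of them weighted by 4) and from
// significance bits 2 and 3 of rho; the caller rotates the result one lane
// left so that every quad also sees its left neighbour's contribution.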
static __m256i proc_cq(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
                       const __m256i right_shift)
{
  auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
  auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);

#ifdef OJPH_ARCH_X86_64
  tmp = _mm256_insert_epi64(tmp,
    _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
#elif (defined OJPH_ARCH_I386)
  int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
  tmp = _mm256_insert_epi32(tmp, lsb, 6);
  int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
  tmp = _mm256_insert_epi32(tmp, msb, 7);
#else
  #error Unsupported architecture
#endif

  tmp = _mm256_slli_epi32(tmp, 2);
  auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
    _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
  tmp = _mm256_add_epi32(tmp1, tmp);

  tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
  tmp1 = _mm256_srli_epi32(tmp1, 1);
  tmp = _mm256_or_si256(tmp, tmp1);

  tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
  tmp1 = _mm256_srli_epi32(tmp1, 2);

  return _mm256_or_si256(tmp, tmp1);
}
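// proc_mel_encode: precompute the per-quad MEL decisions (significance must
// be MEL-coded when c_q == 0; for a pair of quads that both have u_q > 0,
// whether min(u_q) exceeds 2), then walk the quads in pairs and feed the
// events to the MEL coder.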
static void proc_mel_encode(mel_struct *mel,         // coder type name assumed
                            __m256i cq_vec, __m256i &rho_vec,
                            __m256i u_q_vec, ui32 ignore,
                            const __m256i right_shift)
{
  // quads whose context is zero need their significance MEL-coded
  int32_t mel_need_encode[8];
  int32_t mel_need_encode2[8];
  int32_t mel_bit2[8];
  _mm256_storeu_si256((__m256i *)mel_need_encode,
                      _mm256_cmpeq_epi32(cq_vec, ZERO));

  // for each quad, does min(u_q, u_q of the right neighbour) exceed 2?
  auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
  auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
  _mm256_storeu_si256((__m256i*)mel_bit2,
    _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));

  // a u-prefix event is needed when both quads of a pair have u_q > 0
  auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
  _mm256_storeu_si256((__m256i*)mel_need_encode2,
    _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));

  // quads that lie entirely in the padding are skipped (two columns per quad)
  ui32 i_max = 8 - (ignore / 2);
  for (ui32 i = 0; i < i_max; i += 2) {
    if (mel_need_encode[i]) {
      // ... (MEL-encode the significance of quad i) ...
    }
    if (mel_need_encode[i + 1]) {
      // ... (MEL-encode the significance of quad i + 1) ...
    }
    if (mel_need_encode2[i]) {
      // ... (MEL-encode mel_bit2[i], the u-prefix event for the pair) ...
    }
  }
}
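// Main AVX2 code-block encoding routine: samples are consumed two rows at a
// time, sixteen columns (eight quads) per inner iteration, while the MEL,
// VLC and MagSgn streams are accumulated in local buffers and stitched
// together at the end.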
// round the width up to a whole number of 16-column groups; 'ignore' counts
// the padding columns that must not produce output
ui32 width = (_width + 15) & ~15u;
ui32 ignore = width - _width;
const int ms_size = (16384 * 16 + 14) / 15;     // worst-case MagSgn buffer size
const int mel_vlc_size = 3072;
const int mel_size = 192;
const int vlc_size = mel_vlc_size - mel_size;

ui8 ms_buf[ms_size];
ui8 mel_vlc_buf[mel_vlc_size];
ui8 *mel_buf = mel_vlc_buf;
ui8 *vlc_buf = mel_vlc_buf + mel_size;

// ...
ms_init(&ms, ms_size, ms_buf);

// bitplane alignment shift applied to every sample in proc_pixel
const ui32 p = 30 - missing_msbs;

// lane-rotation patterns for _mm256_permutevar8x32_epi32
const __m256i right_shift = _mm256_set_epi32(
  0, 7, 6, 5, 4, 3, 2, 1
);
const __m256i left_shift = _mm256_set_epi32(
  6, 5, 4, 3, 2, 1, 0, 7
);

ui32 n_loop = (width + 15) / 16;

// line state; the extra entry holds the value carried past the right edge
__m256i e_val_vec[65];
for (ui32 i = 0; i < 65; ++i)
  e_val_vec[i] = ZERO;
__m256i prev_e_val_vec = ZERO;

__m256i cx_val_vec[65];
__m256i prev_cx_val_vec = ZERO;
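// Each iteration codes one row pair. The exponent and context values carried
// out of the previous row pair's last vector are parked in the extra 65th
// line-state entry, where the last vector's x + 1 accesses will find them.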
for (ui32 y = 0; y < height; y += 2)
{
  e_val_vec[n_loop] = prev_e_val_vec;

  __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
  cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);

  prev_e_val_vec = ZERO;
  prev_cx_val_vec = ZERO;

  ui32 *sp = buf + y * stride;

  for (ui32 x = 0; x < n_loop; ++x) {
    // load two rows of 16 samples; the last, partial group is staged through
    // a zero-padded temporary buffer
    __m256i src_vec[4];
    if ((x == (n_loop - 1)) && (_width % 16)) {
      ui32 tmp_buf[16] = { 0 };
      memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
      src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
      src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
      if (y + 1 < height) {
        memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
        src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
        src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
      }
      else {
        // assumed: with an odd height the second row does not exist and is
        // treated as insignificant
        src_vec[1] = ZERO;
        src_vec[3] = ZERO;
      }
    }
    else {
      src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
      src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
      if (y + 1 < height) {
        src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
        src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
      }
      else {
        src_vec[1] = ZERO;
        src_vec[3] = ZERO;
      }
    }
    __m256i eq_vec[4], s_vec[4];
    __m256i rho_vec, e_qmax_vec;
    proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);

    // gather the maximum exponent of the neighbours in the previous row pair
    // (each quad looks at its own and the next line-state entry)
    tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
    tmp = _mm256_insert_epi32(tmp,
      _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
    auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
    max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);

    // kappa = 1 when at most one sample of the quad is significant,
    // otherwise max(1, max_e - 1)
    tmp = _mm256_max_epi32(max_e_vec, ONE);
    __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE);
    tmp1 = _mm256_and_si256(rho_vec, tmp1);
    auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
    auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
    auto kappa_vec2_ = _mm256_and_si256(
      _mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
    const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);

    // context of each quad: line state from the row above plus the left
    // neighbour's contribution, carried across vectors through prev_cq
    tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
    auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
    cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
    prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);

    // refresh the line state for the next row pair
    update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
    update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);

    // U_q = max(kappa, E_max); u_q is the part that must be coded
    auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
    auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);

    auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
    __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
    ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;

    proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
                    right_shift);

    // ... (MagSgn encoding, not shown) ...

    // keep only the codeword and its length, then emit the VLC part
    tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
    _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
    _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
    proc_vlc_encode(&vlc, tuple, u_q, _ignore);
  }

  // seed the context of the first quad of the next row pair
  tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
  tmp = _mm256_slli_epi32(tmp, 2);
  tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
  prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
}
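// Assemble the cleanup segment: MagSgn bytes first, then MEL, then the VLC
// bytes (which were written backwards from the end of their buffer).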
lengths[0] = mel.pos + vlc.pos + ms.pos;

memcpy(coded->buf, ms.buf, ms.pos);
memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);

// the last two bytes carry the combined MEL + VLC length (Scup), split into
// a high part and a low nibble
ui32 num_bytes = mel.pos + vlc.pos;
coded->buf[lengths[0] - 1] = (ui8)(num_bytes >> 4);
coded->buf[lengths[0] - 2] = coded->buf[lengths[0] - 2] & 0xF0;
coded->buf[lengths[0] - 2] =
  (ui8)(coded->buf[lengths[0] - 2] | (num_bytes & 0xF));