This repository has been archived by the owner on Jul 29, 2024. It is now read-only.

AVX512_changes #478

Open · wants to merge 13 commits into base: master
6 changes: 5 additions & 1 deletion Source/Lib/ASM_AVX2/CMakeLists.txt
@@ -17,7 +17,10 @@ link_directories(${PROJECT_SOURCE_DIR}/Source/Lib/ASM_SSSE3/)

set(flags_to_test
-mavx2
-static-intel
-mavx512bw
-mavx512vnni
-mavx512vl
-static-intel
/Qwd10148
/Qwd10010
/Qwd10157)
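Note: as with the existing entries, the new -mavx512bw, -mavx512vnni and -mavx512vl flags in flags_to_test are presumably probed per compiler and only appended when supported; on GCC/Clang they also define the matching feature macros, so AVX-512 translation units can guard themselves at compile time. A minimal sketch of such a guard (the add_sat_u16 helper is illustrative, not code from this patch):

#if defined(__AVX512BW__) && defined(__AVX512VL__)
#include <immintrin.h>
/* _mm512_adds_epu16 (VPADDUSW on a zmm register) requires AVX512BW;
 * the 128/256-bit EVEX forms additionally require AVX512VL. */
static inline __m512i add_sat_u16(__m512i a, __m512i b)
{
    return _mm512_adds_epu16(a, b);
}
#endif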
@@ -79,6 +82,7 @@ set(ASM_AVX2_SOURCE
EbNoiseExtractAVX2.c
EbPackUnPack_Intrinsic_AVX2.c
EbPictureOperators_Intrinsic_AVX2.c
EbTransforms_Intrinsic_AVX512.c
EbTransforms_Intrinsic_AVX2.c)

if(COMPILE_AS_CPP)
25 changes: 14 additions & 11 deletions Source/Lib/ASM_AVX2/EbComputeSAD_SadLoopKernel_AVX512.c
@@ -209,6 +209,8 @@ void SadLoopKernel_AVX512_HmeL0_INTRIN(

case 16:
{
__m512i x = _mm512_setr_epi64(0x0000000000000000, 0x0001000100010001, 0x0004000400040004, 0x0005000500050005, 0x0001000100010001, 0x0002000200020002, 0x0005000500050005, 0x0006000600060006);
__m512i x1 = _mm512_setr_epi64(0x0001000100010001, 0x0002000200020002, 0x0005000500050005, 0x0006000600060006, 0x0002000200020002, 0x0003000300030003, 0x0006000600060006, 0x0007000700070007);
if (height <= 16 && searchAreaWidth <= 128)
{
for (i = 0; i<searchAreaHeight; i++)
@@ -243,17 +245,18 @@ void SadLoopKernel_AVX512_HmeL0_INTRIN(

for (j = 0, pRef = pRef1; j < n; j++, pRef += 16)
{
__m256i ss0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef ))), _mm_loadu_si128((__m128i*)(pRef + refStride )), 0x1);
__m256i ss1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8 ))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8 )), 0x1);
ss2 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 16))), _mm_loadu_si128((__m128i*)(pRef + refStride + 16)), 0x1);

__m512i ss1ftemp = _mm512_inserti64x4(_mm512_castsi256_si512(ss0), ss1, 0x1);
__m512i ss2ftemp = _mm512_inserti64x4(_mm512_castsi256_si512(ss1), ss2, 0x1);

ss3sum1_aaray[j] = _mm512_adds_epu16(ss3sum1_aaray[j], _mm512_dbsad_epu8(ref1ftemp, ss1ftemp, 0x94));
ss3sum1_aaray[j] = _mm512_adds_epu16(ss3sum1_aaray[j], _mm512_dbsad_epu8(ref2ftemp, ss1ftemp, 0xE9));
ss7sum1_aaray[j] = _mm512_adds_epu16(ss7sum1_aaray[j], _mm512_dbsad_epu8(ref3ftemp, ss2ftemp, 0x94));
ss7sum1_aaray[j] = _mm512_adds_epu16(ss7sum1_aaray[j], _mm512_dbsad_epu8(ref4ftemp, ss2ftemp, 0xE9));
__m512i ss0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((__m256i*)(pRef))), _mm256_loadu_si256((__m256i*)(pRef + refStride )), 0x1);
__m512i ss1ftemp = _mm512_permutexvar_epi64(x, ss0);
__m512i ss2ftemp = _mm512_permutexvar_epi64(x1, ss0);
__m512i temp = ss3sum1_aaray[j];
__m512i temp1 = ss7sum1_aaray[j];

temp = _mm512_adds_epu16(temp, _mm512_dbsad_epu8(ref1ftemp, ss1ftemp, 0x94));
temp = _mm512_adds_epu16(temp, _mm512_dbsad_epu8(ref2ftemp, ss1ftemp, 0xE9));
temp1 = _mm512_adds_epu16(temp1, _mm512_dbsad_epu8(ref3ftemp, ss2ftemp, 0x94));
temp1 = _mm512_adds_epu16(temp1, _mm512_dbsad_epu8(ref4ftemp, ss2ftemp, 0xE9));
ss3sum1_aaray[j] = temp;
ss7sum1_aaray[j] = temp1;

}
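Note on this rewrite: the old body issued three 128-bit loads per reference-row pair and stitched them together with _mm256_inserti128_si256 and _mm512_inserti64x4, whereas the new body loads both rows with a single _mm512_inserti64x4 of two 256-bit loads and then derives the two overlapping operands with _mm512_permutexvar_epi64. Only bits [2:0] of each 64-bit index element are honoured by that intrinsic, so x selects qwords {0,1,4,5,1,2,5,6} and x1 selects {1,2,5,6,2,3,6,7}, reproducing the same byte ranges the old insert chain built for ss1ftemp and ss2ftemp. A minimal sketch of the load/permute step (build_row_pairs and the uint8_t pointer type are illustrative, not from the patch):

#include <immintrin.h>
#include <stdint.h>

/* rows_lo = { row0[0..15], row1[0..15], row0[8..23],  row1[8..23]  }
 * rows_hi = { row0[8..23], row1[8..23], row0[16..31], row1[16..31] } */
static inline void build_row_pairs(const uint8_t *pRef, uint32_t refStride,
                                   __m512i idx_lo, __m512i idx_hi,
                                   __m512i *rows_lo, __m512i *rows_hi)
{
    /* one load pair covers 32 bytes of the current row and 32 bytes of the next */
    __m512i two_rows = _mm512_inserti64x4(
        _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)pRef)),
        _mm256_loadu_si256((const __m256i *)(pRef + refStride)), 0x1);

    *rows_lo = _mm512_permutexvar_epi64(idx_lo, two_rows);
    *rows_hi = _mm512_permutexvar_epi64(idx_hi, two_rows);
}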

8 changes: 8 additions & 0 deletions Source/Lib/ASM_AVX2/EbTransforms_AVX2.h
@@ -11,6 +11,14 @@
extern "C" {
#endif

#ifdef NON_AVX512_SUPPORT
#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN
#else
#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX512_INTRIN
#endif



void QuantizeInvQuantize8x8_AVX2_INTRIN(
EB_S16 *coeff,
const EB_U32 coeffStride,
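Note on the new alias: EbHevcTransform32_INTRIN is resolved by the preprocessor, so the AVX-512/AVX2 choice is made at build time with no function-pointer indirection or runtime CPUID check; defining NON_AVX512_SUPPORT keeps the existing AVX2 kernel. A minimal illustration of a call site compiled either way (forwardTransform32 is a made-up wrapper, and the parameter types mirror the AVX2 prototype, so treat them as assumptions):

#include "EbTransforms_AVX2.h"

void forwardTransform32(EB_S16 *src, EB_U32 srcStride,
                        EB_S16 *dst, EB_U32 dstStride, EB_U32 shift)
{
    /* expands to EbHevcTransform32_AVX2_INTRIN or
     * EbHevcTransform32_AVX512_INTRIN depending on NON_AVX512_SUPPORT */
    EbHevcTransform32_INTRIN(src, srcStride, dst, dstStride, shift);
}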
134 changes: 76 additions & 58 deletions Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c
@@ -623,7 +623,7 @@ void EbHevcTransform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift)

for (i = 0; i < 16; i++)
{
__m256i x0, x1, x2, x3;
__m256i x0, x1, x2, x3, sox0, sox5, soxa, soxf, s1x0, s1x5, s1xa, s1xf;
__m256i y0, y1, y2, y3;
__m256i a0, a1, a2, a3, a4, a5, a6, a7;
__m256i b0, b1, b2, b3, b4, b5, b6, b7;
@@ -652,61 +652,79 @@ EbHevcTransform32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift)
x2 = y2;
x3 = y3;

a0 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[0]);
a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[2]));
a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[4]));
a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[6]));

a1 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[1]);
a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[3]));
a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xaa), coeff32[5]));
a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0xff), coeff32[7]));

a2 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[8]);
a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[10]));
a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[12]));
a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[14]));

a3 = _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x00), coeff32[9]);
a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0x55), coeff32[11]));
a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xaa), coeff32[13]));
a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(_mm256_shuffle_epi32(x1, 0xff), coeff32[15]));

a4 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[16]);
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[20]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[24]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[28]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[32]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[36]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[40]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[44]));

a5 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[17]);
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[21]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[25]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[29]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[33]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[37]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[41]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[45]));

a6 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[18]);
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[22]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[26]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[30]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[34]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[38]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[42]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[46]));

a7 = _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x00), coeff32[19]);
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0x55), coeff32[23]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xaa), coeff32[27]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x2, 0xff), coeff32[31]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x00), coeff32[35]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0x55), coeff32[39]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xaa), coeff32[43]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(_mm256_shuffle_epi32(x3, 0xff), coeff32[47]));
sox0 = _mm256_shuffle_epi32(x0, 0x00);
sox5 = _mm256_shuffle_epi32(x0, 0x55);
soxa = _mm256_shuffle_epi32(x0, 0xaa);
soxf = _mm256_shuffle_epi32(x0, 0xff);
s1x0 = _mm256_shuffle_epi32(x1, 0x00);
s1x5 = _mm256_shuffle_epi32(x1, 0x55);
s1xa = _mm256_shuffle_epi32(x1, 0xaa);
s1xf = _mm256_shuffle_epi32(x1, 0xff);

a0 = _mm256_madd_epi16(sox0, coeff32[0]);
a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(sox5, coeff32[2]));
a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxa, coeff32[4]));
a0 = _mm256_add_epi32(a0, _mm256_madd_epi16(soxf, coeff32[6]));

a1 = _mm256_madd_epi16(sox0, coeff32[1]);
a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(sox5, coeff32[3]));
a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxa, coeff32[5]));
a1 = _mm256_add_epi32(a1, _mm256_madd_epi16(soxf, coeff32[7]));

a2 = _mm256_madd_epi16(s1x0, coeff32[8]);
a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1x5, coeff32[10]));
a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xa, coeff32[12]));
a2 = _mm256_add_epi32(a2, _mm256_madd_epi16(s1xf, coeff32[14]));

a3 = _mm256_madd_epi16(s1x0, coeff32[9]);
a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1x5, coeff32[11]));
a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xa, coeff32[13]));
a3 = _mm256_add_epi32(a3, _mm256_madd_epi16(s1xf, coeff32[15]));

sox0 = _mm256_shuffle_epi32(x2, 0x00);
sox5 = _mm256_shuffle_epi32(x2, 0x55);
soxa = _mm256_shuffle_epi32(x2, 0xaa);
soxf = _mm256_shuffle_epi32(x2, 0xff);
s1x0 = _mm256_shuffle_epi32(x3, 0x00);
s1x5 = _mm256_shuffle_epi32(x3, 0x55);
s1xa = _mm256_shuffle_epi32(x3, 0xaa);
s1xf = _mm256_shuffle_epi32(x3, 0xff);

a4 = _mm256_madd_epi16(sox0, coeff32[16]);
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(sox5, coeff32[20]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxa, coeff32[24]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(soxf, coeff32[28]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x0, coeff32[32]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1x5, coeff32[36]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xa, coeff32[40]));
a4 = _mm256_add_epi32(a4, _mm256_madd_epi16(s1xf, coeff32[44]));

a5 = _mm256_madd_epi16(sox0, coeff32[17]);
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(sox5, coeff32[21]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxa, coeff32[25]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(soxf, coeff32[29]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x0, coeff32[33]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1x5, coeff32[37]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xa, coeff32[41]));
a5 = _mm256_add_epi32(a5, _mm256_madd_epi16(s1xf, coeff32[45]));

a6 = _mm256_madd_epi16(sox0, coeff32[18]);
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(sox5, coeff32[22]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxa, coeff32[26]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(soxf, coeff32[30]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x0, coeff32[34]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1x5, coeff32[38]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xa, coeff32[42]));
a6 = _mm256_add_epi32(a6, _mm256_madd_epi16(s1xf, coeff32[46]));

a7 = _mm256_madd_epi16(sox0, coeff32[19]);
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(sox5, coeff32[23]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxa, coeff32[27]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(soxf, coeff32[31]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x0, coeff32[35]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1x5, coeff32[39]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xa, coeff32[43]));
a7 = _mm256_add_epi32(a7, _mm256_madd_epi16(s1xf, coeff32[47]));

b0 = _mm256_sra_epi32(_mm256_add_epi32(a0, o0), s0);
b1 = _mm256_sra_epi32(_mm256_add_epi32(a1, o0), s0);
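Note on the restructuring above: the arithmetic is unchanged. The _mm256_shuffle_epi32 broadcasts of x0/x1 (and, after the temporaries are reassigned, of x2/x3) are hoisted into sox0..soxf and s1x0..s1xf, so each broadcast is computed once and shared by the accumulators that need it rather than being re-issued inside every madd chain. A reduced sketch of the pattern, two accumulators and the first coefficient block only (dot_before/dot_after are illustrative names):

#include <immintrin.h>

static inline void dot_before(__m256i x0, const __m256i *coeff32,
                              __m256i *a0, __m256i *a1)
{
    /* every term recomputes its own broadcast of x0 */
    *a0 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[0]);
    *a0 = _mm256_add_epi32(*a0, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[2]));
    *a1 = _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x00), coeff32[1]);
    *a1 = _mm256_add_epi32(*a1, _mm256_madd_epi16(_mm256_shuffle_epi32(x0, 0x55), coeff32[3]));
}

static inline void dot_after(__m256i x0, const __m256i *coeff32,
                             __m256i *a0, __m256i *a1)
{
    /* each broadcast is materialised once and reused by both accumulators */
    __m256i sox0 = _mm256_shuffle_epi32(x0, 0x00);
    __m256i sox5 = _mm256_shuffle_epi32(x0, 0x55);
    *a0 = _mm256_madd_epi16(sox0, coeff32[0]);
    *a0 = _mm256_add_epi32(*a0, _mm256_madd_epi16(sox5, coeff32[2]));
    *a1 = _mm256_madd_epi16(sox0, coeff32[1]);
    *a1 = _mm256_add_epi32(*a1, _mm256_madd_epi16(sox5, coeff32[3]));
}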
@@ -1468,9 +1486,9 @@ EB_EXTERN void lowPrecisionTransform16x16_AVX2_INTRIN(EB_S16 *src, EB_U32 src_st
// forward 32x32 transform
EB_EXTERN void lowPrecisionTransform32x32_AVX2_INTRIN(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift)
{
EbHevcTransform32_AVX2_INTRIN(src, src_stride, intermediate, 32, 6 + addshift);
EbHevcTransform32_INTRIN(src, src_stride, intermediate, 32, 6 + addshift);
EbHevcTranspose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride);
EbHevcTransform32_AVX2_INTRIN(dst, dst_stride, intermediate, 32, 9);
EbHevcTransform32_INTRIN(dst, dst_stride, intermediate, 32, 9);
EbHevcTranspose32_AVX2_INTRIN(intermediate, 32, dst, dst_stride);
}
