Skip to content
This repository has been archived by the owner on Jul 29, 2024. It is now read-only.

AVX512_changes #478

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ In order to run the highest resolution supported by the encoder, at least 64GB o
- Download the yasm exe from the following [link](http://www.tortall.net/projects/yasm/releases/yasm-1.3.0-win64.exe)
- Rename yasm-1.3.0-win64.exe to yasm.exe
- Copy yasm.exe into a location that is in the PATH environment variable
- Vnni requires gcc version >= 9.2.

- __Build Instructions__
- Build the project by following the steps below in a windows command prompt:
Expand Down
5 changes: 4 additions & 1 deletion Source/Lib/ASM_AVX2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ link_directories(${PROJECT_SOURCE_DIR}/Source/Lib/ASM_SSSE3/)

set(flags_to_test
-mavx2
-static-intel
-mavx512bw
-mavx512vnni
-mavx512vl
-static-intel
/Qwd10148
/Qwd10010
/Qwd10157)
Expand Down
25 changes: 14 additions & 11 deletions Source/Lib/ASM_AVX2/EbComputeSAD_SadLoopKernel_AVX512.c
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ void SadLoopKernel_AVX512_HmeL0_INTRIN(

case 16:
{
__m512i x = _mm512_setr_epi64(0x0000000000000000, 0x0001000100010001, 0x0004000400040004, 0x0005000500050005, 0x0001000100010001, 0x0002000200020002, 0x0005000500050005, 0x0006000600060006);
__m512i x1 = _mm512_setr_epi64(0x0001000100010001, 0x0002000200020002, 0x0005000500050005, 0x0006000600060006, 0x0002000200020002, 0x0003000300030003, 0x0006000600060006, 0x0007000700070007);
if (height <= 16 && searchAreaWidth <= 128)
{
for (i = 0; i<searchAreaHeight; i++)
Expand Down Expand Up @@ -243,17 +245,18 @@ void SadLoopKernel_AVX512_HmeL0_INTRIN(

for (j = 0, pRef = pRef1; j < n; j++, pRef += 16)
{
__m256i ss0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef ))), _mm_loadu_si128((__m128i*)(pRef + refStride )), 0x1);
__m256i ss1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8 ))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8 )), 0x1);
ss2 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 16))), _mm_loadu_si128((__m128i*)(pRef + refStride + 16)), 0x1);

__m512i ss1ftemp = _mm512_inserti64x4(_mm512_castsi256_si512(ss0), ss1, 0x1);
__m512i ss2ftemp = _mm512_inserti64x4(_mm512_castsi256_si512(ss1), ss2, 0x1);

ss3sum1_aaray[j] = _mm512_adds_epu16(ss3sum1_aaray[j], _mm512_dbsad_epu8(ref1ftemp, ss1ftemp, 0x94));
ss3sum1_aaray[j] = _mm512_adds_epu16(ss3sum1_aaray[j], _mm512_dbsad_epu8(ref2ftemp, ss1ftemp, 0xE9));
ss7sum1_aaray[j] = _mm512_adds_epu16(ss7sum1_aaray[j], _mm512_dbsad_epu8(ref3ftemp, ss2ftemp, 0x94));
ss7sum1_aaray[j] = _mm512_adds_epu16(ss7sum1_aaray[j], _mm512_dbsad_epu8(ref4ftemp, ss2ftemp, 0xE9));
__m512i ss0 = _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_loadu_si256((__m256i*)(pRef))), _mm256_loadu_si256((__m256i*)(pRef + refStride )), 0x1);
__m512i ss1ftemp = _mm512_permutexvar_epi64(x, ss0);
__m512i ss2ftemp = _mm512_permutexvar_epi64(x1, ss0);
__m512i temp = ss3sum1_aaray[j];
__m512i temp1 = ss7sum1_aaray[j];

temp = _mm512_adds_epu16(temp, _mm512_dbsad_epu8(ref1ftemp, ss1ftemp, 0x94));
temp = _mm512_adds_epu16(temp, _mm512_dbsad_epu8(ref2ftemp, ss1ftemp, 0xE9));
temp1 = _mm512_adds_epu16(temp1, _mm512_dbsad_epu8(ref3ftemp, ss2ftemp, 0x94));
temp1 = _mm512_adds_epu16(temp1, _mm512_dbsad_epu8(ref4ftemp, ss2ftemp, 0xE9));
ss3sum1_aaray[j] = temp;
ss7sum1_aaray[j] = temp1;

}

Expand Down
8 changes: 8 additions & 0 deletions Source/Lib/ASM_AVX2/EbTransforms_AVX2.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
extern "C" {
#endif

#ifdef VNNI_SUPPORT
#define EbHevcTransform32_INTRIN EbHevcTransform32_VNNI_INTRIN
#else
#define EbHevcTransform32_INTRIN EbHevcTransform32_AVX2_INTRIN
#endif



void QuantizeInvQuantize8x8_AVX2_INTRIN(
EB_S16 *coeff,
const EB_U32 coeffStride,
Expand Down
291 changes: 233 additions & 58 deletions Source/Lib/ASM_AVX2/EbTransforms_Intrinsic_AVX2.c

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions Source/Lib/ASM_SSSE3/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ include_directories(${PROJECT_SOURCE_DIR}/Source/API/

set(flags_to_test
-mssse3
-mavx512bw
-mavx512vnni
-mavx512vl
-msse4.1
-static-intel)

Expand All @@ -40,9 +43,9 @@ foreach(cflag ${flags_to_test})
endforeach()

if(CMAKE_C_COMPILER_ID STREQUAL "Intel")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w ")
if(COMPILE_AS_CPP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w ")
endif()
endif()

Expand All @@ -52,6 +55,7 @@ set(ASM_SSSE3_SOURCE
EbDeblockingFilter_Intrinsic_SSSE3.c
EbIntraPrediction16bit_Intrinsic_SSSE3.c
EbMcp_Intrinsic_SSSE3.c
EbMcp_Intrinsic_AVX512VNNI.c
EbSaoApplication_Intrinsic_SSSE3.c
EbTransforms_Intrinsic_SSSE3.c)

Expand Down
138 changes: 138 additions & 0 deletions Source/Lib/ASM_SSSE3/EbMcp_Intrinsic_AVX512VNNI.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include "EbMcp_SSSE3.h"
#include "EbDefinitions.h"

#include "immintrin.h"

#ifdef VNNI_SUPPORT

const EB_S16 EbHevcLumaFilterCoeff1[4][8] =
{
{ 0, 0, 0, 64, 0, 0, 0, 0},
{-1, 4,-10, 58, 17, -5, 1, 0},
{-1, 4,-11, 40, 40,-11, 4, -1},
{ 0, 1, -5, 17, 58,-10, 4, -1}
};

static const EB_S16 EbHevcLumaFilterCoeff7[4][8] =
{
{ 0, 0, 0, 64, 0, 0, 0, 0},
{-1, 4,-10, 58, 17, -5, 1, 0},
{-1, 4,-11, 40, 40,-11, 4, -1},
{ 1, -5, 17, 58,-10, 4, -1, 0}
};

#ifndef NON_AVX512_SUPPORT
void LumaInterpolationFilterOneDOutRawHorizontal_AVX512(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why no code is calling LumaInterpolationFilterOneDOutRawHorizontal_AVX512 ?

EB_BYTE refPic,
EB_U32 srcStride,
EB_S16 *dst,
EB_U32 puWidth,
EB_U32 puHeight,
EB_U32 fracPosx)
{
EB_S32 rowCount, colCount;
__m128i c0, c1, c2, c3; // coeffs
__m128i a0, a1;
__m128i b0;
__m128i sum;
EB_BYTE ptr;

refPic -= 3;

PrefetchBlock(refPic, srcStride, (puWidth == 4) ? 16 : puWidth+8, (puWidth == 4) ? ((puHeight+1)&~1) : puHeight);

c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff1[fracPosx]);
c0 = _mm_packs_epi16(c0, c0);
__m128i ct = _mm_srli_epi64(c0, 32);
__m512i cc0 = _mm512_broadcastd_epi32(c0);
__m512i cc1 = _mm512_broadcastd_epi32(ct);
c0 = _mm_unpacklo_epi16(c0, c0);
c3 = _mm_shuffle_epi32(c0, 0xff);
c2 = _mm_shuffle_epi32(c0, 0xaa);
c1 = _mm_shuffle_epi32(c0, 0x55);
c0 = _mm_shuffle_epi32(c0, 0x00);
__m512i b1 = _mm512_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0);
__m512i b2 = _mm512_set_epi8(14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4);


if (puWidth & 4)
{
ptr = refPic;
rowCount = puHeight;
do
{
a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride;
a1 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride;
b0 = _mm_unpacklo_epi64(a0, a1);
sum = _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c0);
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c1));
b0 = _mm_unpacklo_epi64(_mm_srli_si128(a0, 4), _mm_srli_si128(a1, 4));
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c2));
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c3));

sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64));

_mm_storeu_si128((__m128i *)dst, sum);
dst += 8;

rowCount -= 2;
}
while (rowCount > 0);

puWidth -= 4;
if (puWidth == 0)
{
return;
}

refPic += 4;
}
colCount = puWidth;
int rowLoop = puHeight >>1 ;//divide by 2
int evenRow = puHeight & 1;
do
{
ptr = refPic;
// rowCount = puHeight;
int rowCount = rowLoop ;//divide by 2
do
{
__m512i a1 = _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr)));
__m256i b0 = _mm256_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr + srcStride))); ptr += 2 * srcStride;
__m512i s1 = _mm512_inserti64x4(a1, b0, 1);
__m512i sh2 = _mm512_shuffle_epi8(s1, b1);
__m512i sh3 = _mm512_shuffle_epi8(s1, b2);
__m512i sum00 = _mm512_setzero_si512();
__m512i sum0 = _mm512_dpbusds_epi32(sum00, sh2, cc0);
__m512i sum1 = _mm512_dpbusds_epi32(sum0, sh3, cc1);
__m512i f1 = _mm512_packs_epi32(sum1,sum1);//
__m512i f2 = _mm512_permutexvar_epi64( _mm512_setr_epi64(0x0, 0x0000000000000002, 0x0000000000000004, 0x0000000000000006, 0x0, 0x0002000200020002, 0x0004000400040004, 0x0006000600060006), f1);
f2 = _mm512_sub_epi16(f2, _mm512_set1_epi16(128 * 64));
_mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(f2));
dst += 16;
rowCount = rowCount - 1;
}
while (rowCount > 0);

if (evenRow)
{
a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride;

sum = _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8)), c0);
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10)), c1));
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12)), c2));
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14)), c3));

sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64));

_mm_storeu_si128((__m128i *)dst, sum);
dst += 8;
}

refPic += 8;
colCount -= 8;
}
while (colCount > 0);
}
#endif
#endif
Loading