This repository has been archived by the owner on Jul 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 169
AVX512_changes #478
Open
deeptiag1
wants to merge
13
commits into
OpenVisualCloud:master
Choose a base branch
from
deeptiag1:vnni_changes_1
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
AVX512_changes #478
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
2c2c7e4
AVX512_changes
157cbe8
AVX512_changes_P2
deeptiag1 4093056
AVX512_changes_P3
deeptiag1 b00574f
AVX512_changes_P4
deeptiag1 b827613
AVX512_changes_P4
deeptiag1 d14b735
AVX512_changes_P4
deeptiag1 0bc4093
AVX512_changes_P4
deeptiag1 67ec8bf
AVX512_changes_Revert
deeptiag1 d92a7c4
changes_incorpoarted
deeptiag1 d983a3b
avx512-changes_v3
deeptiag1 06ec13d
avx512-changes_v3
deeptiag1 0f5ce17
avx512-changes_v3
deeptiag1 a23033f
avx512-changes_v3
deeptiag1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#include "EbMcp_SSSE3.h" | ||
#include "EbDefinitions.h" | ||
|
||
#include "immintrin.h" | ||
|
||
#ifdef VNNI_SUPPORT | ||
|
||
/*
 * 8-tap luma interpolation coefficients, one row per fractional position.
 * Row index is the fracPosx argument of
 * LumaInterpolationFilterOneDOutRawHorizontal_AVX512; every row sums to 64.
 */
const EB_S16 EbHevcLumaFilterCoeff1[4][8] = {
    /* frac 0 */ {  0,  0,   0, 64,  0,   0,  0,  0 },
    /* frac 1 */ { -1,  4, -10, 58, 17,  -5,  1,  0 },
    /* frac 2 */ { -1,  4, -11, 40, 40, -11,  4, -1 },
    /* frac 3 */ {  0,  1,  -5, 17, 58, -10,  4, -1 },
};
|
||
/*
 * Variant of the 8-tap luma coefficient table: rows 0..2 match
 * EbHevcLumaFilterCoeff1, while row 3 holds the same seven non-zero taps
 * shifted one position towards index 0 (the trailing tap is 0).
 * NOTE(review): nothing in the visible part of this file references this
 * table - confirm it is still needed.
 */
static const EB_S16 EbHevcLumaFilterCoeff7[4][8] = {
    /* frac 0 */ {  0,  0,   0, 64,   0,   0,  0,  0 },
    /* frac 1 */ { -1,  4, -10, 58,  17,  -5,  1,  0 },
    /* frac 2 */ { -1,  4, -11, 40,  40, -11,  4, -1 },
    /* frac 3 */ {  1, -5,  17, 58, -10,   4, -1,  0 },
};
|
||
#ifndef NON_AVX512_SUPPORT | ||
void LumaInterpolationFilterOneDOutRawHorizontal_AVX512( | ||
EB_BYTE refPic, | ||
EB_U32 srcStride, | ||
EB_S16 *dst, | ||
EB_U32 puWidth, | ||
EB_U32 puHeight, | ||
EB_U32 fracPosx) | ||
{ | ||
EB_S32 rowCount, colCount; | ||
__m128i c0, c1, c2, c3; // coeffs | ||
__m128i a0, a1; | ||
__m128i b0; | ||
__m128i sum; | ||
EB_BYTE ptr; | ||
|
||
refPic -= 3; | ||
|
||
PrefetchBlock(refPic, srcStride, (puWidth == 4) ? 16 : puWidth+8, (puWidth == 4) ? ((puHeight+1)&~1) : puHeight); | ||
|
||
c0 = _mm_loadu_si128((__m128i *)EbHevcLumaFilterCoeff1[fracPosx]); | ||
c0 = _mm_packs_epi16(c0, c0); | ||
__m128i ct = _mm_srli_epi64(c0, 32); | ||
__m512i cc0 = _mm512_broadcastd_epi32(c0); | ||
__m512i cc1 = _mm512_broadcastd_epi32(ct); | ||
c0 = _mm_unpacklo_epi16(c0, c0); | ||
c3 = _mm_shuffle_epi32(c0, 0xff); | ||
c2 = _mm_shuffle_epi32(c0, 0xaa); | ||
c1 = _mm_shuffle_epi32(c0, 0x55); | ||
c0 = _mm_shuffle_epi32(c0, 0x00); | ||
__m512i b1 = _mm512_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); | ||
__m512i b2 = _mm512_set_epi8(14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4, 14, 13, 12, 11, 13, 12, 11, 10, 12, 11, 10, 9, 11, 10, 9, 8, 10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, 4); | ||
|
||
|
||
if (puWidth & 4) | ||
{ | ||
ptr = refPic; | ||
rowCount = puHeight; | ||
do | ||
{ | ||
a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; | ||
a1 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; | ||
b0 = _mm_unpacklo_epi64(a0, a1); | ||
sum = _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c0); | ||
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c1)); | ||
b0 = _mm_unpacklo_epi64(_mm_srli_si128(a0, 4), _mm_srli_si128(a1, 4)); | ||
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); | ||
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(b0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); | ||
|
||
sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); | ||
|
||
_mm_storeu_si128((__m128i *)dst, sum); | ||
dst += 8; | ||
|
||
rowCount -= 2; | ||
} | ||
while (rowCount > 0); | ||
|
||
puWidth -= 4; | ||
if (puWidth == 0) | ||
{ | ||
return; | ||
} | ||
|
||
refPic += 4; | ||
} | ||
colCount = puWidth; | ||
int rowLoop = puHeight >>1 ;//divide by 2 | ||
int evenRow = puHeight & 1; | ||
do | ||
{ | ||
ptr = refPic; | ||
// rowCount = puHeight; | ||
int rowCount = rowLoop ;//divide by 2 | ||
do | ||
{ | ||
__m512i a1 = _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr))); | ||
__m256i b0 = _mm256_broadcast_i32x4(_mm_loadu_si128((__m128i*)(ptr + srcStride))); ptr += 2 * srcStride; | ||
__m512i s1 = _mm512_inserti64x4(a1, b0, 1); | ||
__m512i sh2 = _mm512_shuffle_epi8(s1, b1); | ||
__m512i sh3 = _mm512_shuffle_epi8(s1, b2); | ||
__m512i sum00 = _mm512_setzero_si512(); | ||
__m512i sum0 = _mm512_dpbusds_epi32(sum00, sh2, cc0); | ||
__m512i sum1 = _mm512_dpbusds_epi32(sum0, sh3, cc1); | ||
__m512i f1 = _mm512_packs_epi32(sum1,sum1);// | ||
__m512i f2 = _mm512_permutexvar_epi64( _mm512_setr_epi64(0x0, 0x0000000000000002, 0x0000000000000004, 0x0000000000000006, 0x0, 0x0002000200020002, 0x0004000400040004, 0x0006000600060006), f1); | ||
f2 = _mm512_sub_epi16(f2, _mm512_set1_epi16(128 * 64)); | ||
_mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(f2)); | ||
dst += 16; | ||
rowCount = rowCount - 1; | ||
} | ||
while (rowCount > 0); | ||
|
||
if (evenRow) | ||
{ | ||
a0 = _mm_loadu_si128((__m128i *)ptr); ptr += srcStride; | ||
|
||
sum = _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8)), c0); | ||
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10)), c1)); | ||
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12)), c2)); | ||
sum = _mm_add_epi16(sum, _mm_maddubs_epi16(_mm_shuffle_epi8(a0, _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14)), c3)); | ||
|
||
sum = _mm_sub_epi16(sum, _mm_set1_epi16(128*64)); | ||
|
||
_mm_storeu_si128((__m128i *)dst, sum); | ||
dst += 8; | ||
} | ||
|
||
refPic += 8; | ||
colCount -= 8; | ||
} | ||
while (colCount > 0); | ||
} | ||
#endif | ||
#endif |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is no code calling LumaInterpolationFilterOneDOutRawHorizontal_AVX512?