Add AArch64 SVE implementation for TCoeffOps fastFwdCore_2D
The SVE 16-bit dot-product instructions allow us to accumulate twice as
much data per instruction compared to Neon multiply-add instructions,
giving a good speedup for the fastFwdCore_2D kernels.
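
As an illustration of the difference (a minimal sketch with hypothetical
helper names, not code from this change): a Neon widening multiply-add
such as vmlal_s16 consumes four 16-bit lanes per instruction, while an
SVE 16-bit dot-product consumes eight 16-bit lanes per 128 bits of
vector length, summing groups of four products into each 64-bit lane.

#include <arm_neon.h>
#include <arm_sve.h>

// Neon: one widening multiply-accumulate covers 4 x int16 products.
int32x4_t neon_mla( int32x4_t acc, int16x4_t a, int16x4_t b )
{
  return vmlal_s16( acc, a, b );
}

// SVE: one 16-bit dot-product covers 8 x int16 products per 128 bits of
// vector length, accumulating groups of four into each 64-bit lane.
svint64_t sve_dot( svint64_t acc, svint16_t a, svint16_t b )
{
  return svdot_s64( acc, a, b );
}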

Compared to Neon, which has a fixed vector length of 128 bits, SVE allows
different micro-architectures to implement a range of vector lengths:
128, 256, 512, 1024, or 2048 bits. To take advantage of this we can
rewrite the innermost loop of fastFwdCore_2D to be expressed in terms of
the number of vectors to process rather than the number of elements, and
then pick the number of iterations at setup time by inspecting the
vector length. This largely avoids the need for an entire set of kernels
for each possible vector length.
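
Schematically (a simplified, hypothetical sketch of the idea rather than
the actual kernel added below): the kernel is templated on the number of
vectors per row, svcnth() gives the number of 16-bit elements per vector
at run time, and the dispatcher picks the instantiation once at setup.

#include <arm_sve.h>
#include <cstdint>

// trVecs is a compile-time vector count; the number of 16-bit elements
// per vector (svcnth()) is only known at run time.
template<unsigned trVecs>
void kernel_sketch( const int16_t* a, const int16_t* b, int64_t* out )
{
  svint64_t acc = svdup_n_s64( 0 );
  for( unsigned k = 0; k < trVecs; ++k )
  {
    svint16_t va = svld1_s16( svptrue_b16(), a + k * svcnth() );
    svint16_t vb = svld1_s16( svptrue_b16(), b + k * svcnth() );
    acc = svdot_s64( acc, va, vb );
  }
  *out = svaddv_s64( svptrue_b64(), acc );
}

using KernelFn = void ( * )( const int16_t*, const int16_t*, int64_t* );

// Setup-time selection: a row of trSize elements needs trSize / svcnth()
// vector iterations, so the same row length maps to a different
// instantiation depending on the hardware vector length.
KernelFn pick_kernel( unsigned trSize )
{
  switch( trSize / svcnth() )
  {
  case 1: return kernel_sketch<1>;
  case 2: return kernel_sketch<2>;
  case 4: return kernel_sketch<4>;
  default: return nullptr; // fall back to the Neon implementation
  }
}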

One caveat to the notion of having completely vector-length-agnostic
kernels is that when the vector length is known to be exactly 128 bits
(the same as Neon) we can make use of some Neon instructions to speed up
processing the data after the accumulation. This is possible because
Neon and SVE registers share the low 128 bits of each vector register.
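
For example (a minimal sketch, not the kernel itself), the
arm_neon_sve_bridge.h header lets a kernel built for a known 128-bit
vector length reinterpret an SVE accumulator as a Neon register at no
cost:

#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>
#include <arm_sve.h>

// Only valid when the SVE vector length is exactly 128 bits: the Neon
// view aliases the low 128 bits of each SVE register, so the pairwise
// add sees the entire accumulator.
int64x2_t pairwise_sum_vl128( svint64_t v0, svint64_t v1 )
{
  return vpaddq_s64( svget_neonq_s64( v0 ), svget_neonq_s64( v1 ) );
}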

For this commit we have not attempted to add kernels that process less
than a full vector's worth of data per inner-loop iteration, which would
enable using these kernels on machines with very wide vectors (512,
1024, or 2048 bits). This is technically straightforward since SVE
supports partial vectors via predication; however, there are no known
long-vector micro-architectures available at present to justify
maintaining such code.
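
For reference only (a hypothetical sketch, not something this commit
adds): a predicated load could cover a row shorter than one vector, with
inactive lanes loading as zero and contributing nothing to the dot
product.

#include <arm_sve.h>
#include <cstdint>

// Hypothetical helper: accumulate a dot product over only the first n
// 16-bit elements of a and b, where n may be less than a full vector.
svint64_t dot_partial( svint64_t acc, const int16_t* a, const int16_t* b, uint64_t n )
{
  svbool_t pg = svwhilelt_b16_u64( 0, n ); // lanes [0, n) active
  svint16_t va = svld1_s16( pg, a );       // inactive lanes load zero
  svint16_t vb = svld1_s16( pg, b );
  return svdot_s64( acc, va, vb );         // zero lanes add nothing
}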

Running a video encoding job on SVE-capable machines using the
--preset=fast setting shows the following improvements in reported FPS:

Neoverse V1 (VL=256 bits): ~1.3%
Neoverse V2 (VL=128 bits): ~2.6%
georges-arm committed Nov 28, 2024
1 parent 24b36a3 commit 4c8bfef
Showing 2 changed files with 259 additions and 4 deletions.
12 changes: 8 additions & 4 deletions source/Lib/CommonLib/arm/InitARM.cpp
@@ -118,8 +118,14 @@ void TCoeffOps::initTCoeffOpsARM()
{
_initTCoeffOpsARM<NEON>();
}
#if TARGET_SIMD_ARM_SVE
if( vext >= SVE )
{
_initTCoeffOpsARM<SVE>();
}
#endif // TARGET_SIMD_ARM_SVE
}
#endif
#endif // ENABLE_SIMD_TRAFO

#if ENABLE_SIMD_OPT_BDOF
void InterPredInterpolation::initInterPredictionARM()
@@ -135,8 +141,6 @@ void InterPredInterpolation::initInterPredictionARM()
}
#endif



#endif // TARGET_SIMD_ARM
#endif // TARGET_SIMD_ARM

} // namespace
251 changes: 251 additions & 0 deletions source/Lib/CommonLib/arm/sve/Trafo_sve.cpp
@@ -0,0 +1,251 @@
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.
The Clear BSD License
Copyright (c) 2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------------------- */
/**
* \file Trafo_sve.cpp
* \brief SVE implementation of TCoeffOps for AArch64.
*/

// ====================================================================================================================
// Includes
// ====================================================================================================================
#include "CommonDefARM.h"
#include "CommonLib/CommonDef.h"

#include "TrQuant.h"
#include "TrQuant_EMT.h"

//! \ingroup CommonLib
//! \{

#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_TRAFO

#include <arm_neon_sve_bridge.h>
#include <arm_sve.h>

namespace vvenc
{

static svint16_t load_narrow_to_s16( const int32_t* src )
{
svint32_t lo = svld1_vnum_s32( svptrue_b32(), src, 0 );
svint32_t hi = svld1_vnum_s32( svptrue_b32(), src, 1 );
return svuzp1_s16( svreinterpret_s16_s32( lo ), svreinterpret_s16_s32( hi ) );
}

static int64_t shift_and_round( int64_t x, int shift )
{
return ( x + ( 1 << ( shift - 1 ) ) ) >> shift;
}

template<int vlBits>
static inline void fastFwdCore_reduce_x4_sve( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
int shift );

template<>
inline void fastFwdCore_reduce_x4_sve<128>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
int shift )
{
// For a 128-bit vector length we do not need to reduce the sum down, use
// svget_neonq to operate on the Neon vectors directly so we can use pairwise
// additions to incrementally sum each vector.
int64x2_t v01 = vpaddq_s64( svget_neonq_s64( v0 ), svget_neonq_s64( v1 ) );
int64x2_t v23 = vpaddq_s64( svget_neonq_s64( v2 ), svget_neonq_s64( v3 ) );
int32x4_t v0123 = vuzp1q_s32( vreinterpretq_s32_s64( v01 ), vreinterpretq_s32_s64( v23 ) );
v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
vst1q_s32( dst, v0123 );
}

template<>
inline void fastFwdCore_reduce_x4_sve<256>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
int shift )
{
// Halve the data width such that we only utilise the low half (128 bits) of each vector.
svint32_t v0_s32 = svuzp1_s32( svreinterpret_s32_s64( v0 ), svreinterpret_s32_s64( v0 ) );
svint32_t v1_s32 = svuzp1_s32( svreinterpret_s32_s64( v1 ), svreinterpret_s32_s64( v1 ) );
svint32_t v2_s32 = svuzp1_s32( svreinterpret_s32_s64( v2 ), svreinterpret_s32_s64( v2 ) );
svint32_t v3_s32 = svuzp1_s32( svreinterpret_s32_s64( v3 ), svreinterpret_s32_s64( v3 ) );

// Now that we have data in the low 128 bits of each vector, use svget_neonq
// to operate on the Neon vectors directly and use pairwise additions to
// incrementally sum each vector.
int32x4_t v01 = vpaddq_s32( svget_neonq_s32( v0_s32 ), svget_neonq_s32( v1_s32 ) );
int32x4_t v23 = vpaddq_s32( svget_neonq_s32( v2_s32 ), svget_neonq_s32( v3_s32 ) );
int32x4_t v0123 = vpaddq_s32( v01, v23 );
v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
vst1q_s32( dst, v0123 );
}

template<int vlBits, unsigned trVecs>
static void fastFwdCore_nVec_sve( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line,
unsigned reducedLine, unsigned cutoff, int shift )
{
CHECK( cutoff % 4 != 0, "Cutoff should be a multiple of four" );
CHECK( cutoff == 0, "Cutoff should be non-zero" );
CHECK( shift == 0, "Shift must be at least one" );

unsigned trSize = trVecs * svcnth();
unsigned i = 0;
for( ; i < ( reducedLine & ~3U ); i += 4 )
{
for( int j = 0; j < cutoff; j += 4 )
{
const TMatrixCoeff* tcj = tc + j * trSize;
const TCoeff* srci = src + i * trSize;

svint64_t sum00 = svdup_n_s64( 0 );
svint64_t sum01 = svdup_n_s64( 0 );
svint64_t sum02 = svdup_n_s64( 0 );
svint64_t sum03 = svdup_n_s64( 0 );
svint64_t sum10 = svdup_n_s64( 0 );
svint64_t sum11 = svdup_n_s64( 0 );
svint64_t sum12 = svdup_n_s64( 0 );
svint64_t sum13 = svdup_n_s64( 0 );
svint64_t sum20 = svdup_n_s64( 0 );
svint64_t sum21 = svdup_n_s64( 0 );
svint64_t sum22 = svdup_n_s64( 0 );
svint64_t sum23 = svdup_n_s64( 0 );
svint64_t sum30 = svdup_n_s64( 0 );
svint64_t sum31 = svdup_n_s64( 0 );
svint64_t sum32 = svdup_n_s64( 0 );
svint64_t sum33 = svdup_n_s64( 0 );
for( int k = 0; k < trVecs; ++k )
{
svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
svint16_t s1 = load_narrow_to_s16( srci + 1 * trSize );
svint16_t s2 = load_narrow_to_s16( srci + 2 * trSize );
svint16_t s3 = load_narrow_to_s16( srci + 3 * trSize );
svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
sum00 = svdot_s64( sum00, s0, c0 );
sum01 = svdot_s64( sum01, s0, c1 );
sum02 = svdot_s64( sum02, s0, c2 );
sum03 = svdot_s64( sum03, s0, c3 );
sum10 = svdot_s64( sum10, s1, c0 );
sum11 = svdot_s64( sum11, s1, c1 );
sum12 = svdot_s64( sum12, s1, c2 );
sum13 = svdot_s64( sum13, s1, c3 );
sum20 = svdot_s64( sum20, s2, c0 );
sum21 = svdot_s64( sum21, s2, c1 );
sum22 = svdot_s64( sum22, s2, c2 );
sum23 = svdot_s64( sum23, s2, c3 );
sum30 = svdot_s64( sum30, s3, c0 );
sum31 = svdot_s64( sum31, s3, c1 );
sum32 = svdot_s64( sum32, s3, c2 );
sum33 = svdot_s64( sum33, s3, c3 );

srci += svcnth();
tcj += svcnth();
}
TCoeff* dstij = dst + j * line + i;
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 0 * line, sum00, sum10, sum20, sum30, shift );
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 1 * line, sum01, sum11, sum21, sum31, shift );
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 2 * line, sum02, sum12, sum22, sum32, shift );
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 3 * line, sum03, sum13, sum23, sum33, shift );
}
}
for( ; i < reducedLine; ++i )
{
for( int j = 0; j < cutoff; j += 4 )
{
const TMatrixCoeff* tcj = tc + j * trSize;
const TCoeff* srci = src + i * trSize;

svint64_t sum00 = svdup_n_s64( 0 );
svint64_t sum01 = svdup_n_s64( 0 );
svint64_t sum02 = svdup_n_s64( 0 );
svint64_t sum03 = svdup_n_s64( 0 );
for( int k = 0; k < trVecs; ++k )
{
svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
sum00 = svdot_s64( sum00, s0, c0 );
sum01 = svdot_s64( sum01, s0, c1 );
sum02 = svdot_s64( sum02, s0, c2 );
sum03 = svdot_s64( sum03, s0, c3 );

srci += svcnth();
tcj += svcnth();
}
TCoeff* dstij = dst + j * line + i;
dstij[ 0 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum00 ), shift );
dstij[ 1 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum01 ), shift );
dstij[ 2 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum02 ), shift );
dstij[ 3 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum03 ), shift );
}
}
}

template<>
void TCoeffOps::_initTCoeffOpsARM<SVE>()
{
// Wire up kernels based on how many vector iterations we need in the inner
// loop. Use Neon if we don't have at least one vector of work to do. Arm
// Neoverse micro-architectures only currently exist with vector lengths of
// 128 and 256 bits, so don't bother specialising for other vector lengths.
switch( svcnth() )
{
case 8: // SVE VL = 128-bits
fastFwdCore_2D[ 1 ] = fastFwdCore_nVec_sve<128, 1>;
fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<128, 2>;
fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<128, 4>;
fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<128, 8>;
break;
case 16: // SVE VL = 256-bits
fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<256, 1>;
fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<256, 2>;
fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<256, 4>;
break;
default:
// Don't use SVE for other vector lengths, fall back to Neon.
break;
}
}

} // namespace vvenc

#endif
//! \}
