Add AArch64 SVE implementation for TCoeffOps fastFwdCore_2D
The SVE 16-bit dot-product instructions allow us to accumulate twice as
much data per instruction compared to Neon multiply-add instructions,
giving a good speedup for the fastFwdCore_2D kernels.
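
As an illustration of the difference (a minimal sketch with hypothetical
helper names, not code from this change): a Neon widening multiply-add
such as vmlal_s16 consumes four 16-bit lanes per instruction, while an
SVE 16-bit dot-product consumes eight 16-bit lanes per 128 bits of
vector length, summing groups of four products into each 64-bit lane.

#include <arm_neon.h>
#include <arm_sve.h>

// Neon: one widening multiply-accumulate covers 4 x int16 products.
int32x4_t neon_mla( int32x4_t acc, int16x4_t a, int16x4_t b )
{
  return vmlal_s16( acc, a, b );
}

// SVE: one 16-bit dot-product covers 8 x int16 products per 128 bits of
// vector length, accumulating groups of four into each 64-bit lane.
svint64_t sve_dot( svint64_t acc, svint16_t a, svint16_t b )
{
  return svdot_s64( acc, a, b );
}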

Compared to Neon, which has a fixed vector length of 128 bits, SVE allows
different micro-architectures to implement a range of vector lengths:
128, 256, 512, 1024, or 2048 bits. To take advantage of this we can
rewrite the innermost loop of fastFwdCore_2D to be expressed in terms of
the number of vectors to process rather than the number of elements, and
then pick the number of iterations at setup time by inspecting the
vector length. This largely avoids the need for an entire set of kernels
for each possible vector length.
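
Schematically (a simplified, hypothetical sketch of the idea rather than
the actual kernel added below): the kernel is templated on the number of
vectors per row, svcnth() gives the number of 16-bit elements per vector
at run time, and the dispatcher picks the instantiation once at setup.

#include <arm_sve.h>
#include <cstdint>

// trVecs is a compile-time vector count; the number of 16-bit elements
// per vector (svcnth()) is only known at run time.
template<unsigned trVecs>
void kernel_sketch( const int16_t* a, const int16_t* b, int64_t* out )
{
  svint64_t acc = svdup_n_s64( 0 );
  for( unsigned k = 0; k < trVecs; ++k )
  {
    svint16_t va = svld1_s16( svptrue_b16(), a + k * svcnth() );
    svint16_t vb = svld1_s16( svptrue_b16(), b + k * svcnth() );
    acc = svdot_s64( acc, va, vb );
  }
  *out = svaddv_s64( svptrue_b64(), acc );
}

using KernelFn = void ( * )( const int16_t*, const int16_t*, int64_t* );

// Setup-time selection: a row of trSize elements needs trSize / svcnth()
// vector iterations, so the same row length maps to a different
// instantiation depending on the hardware vector length.
KernelFn pick_kernel( unsigned trSize )
{
  switch( trSize / svcnth() )
  {
  case 1: return kernel_sketch<1>;
  case 2: return kernel_sketch<2>;
  case 4: return kernel_sketch<4>;
  default: return nullptr; // fall back to the Neon implementation
  }
}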

One caveat to the notion of having completely vector-length-agnostic
kernels is that when the vector length is known to be exactly 128 bits
(the same as Neon) we can make use of some Neon instructions to speed up
processing the data after the accumulation. This is possible because
Neon and SVE registers share the low 128 bits of each vector register.
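
For example (a minimal sketch, not the kernel itself), the
arm_neon_sve_bridge.h header lets a kernel built for a known 128-bit
vector length reinterpret an SVE accumulator as a Neon register at no
cost:

#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>
#include <arm_sve.h>

// Only valid when the SVE vector length is exactly 128 bits: the Neon
// view aliases the low 128 bits of each SVE register, so the pairwise
// add sees the entire accumulator.
int64x2_t pairwise_sum_vl128( svint64_t v0, svint64_t v1 )
{
  return vpaddq_s64( svget_neonq_s64( v0 ), svget_neonq_s64( v1 ) );
}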

For this commit we have not attempted to add kernels that process less
than a full vector's worth of data per inner-loop iteration, which would
enable using these kernels on machines with very wide vectors (512,
1024, or 2048 bits). This is technically straightforward since SVE
supports partial vectors via predication; however, there are no known
long-vector micro-architectures available at present to justify
maintaining such code.
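
For reference only (a hypothetical sketch, not something this commit
adds): a predicated load could cover a row shorter than one vector, with
inactive lanes loading as zero and contributing nothing to the dot
product.

#include <arm_sve.h>
#include <cstdint>

// Hypothetical helper: accumulate a dot product over only the first n
// 16-bit elements of a and b, where n may be less than a full vector.
svint64_t dot_partial( svint64_t acc, const int16_t* a, const int16_t* b, uint64_t n )
{
  svbool_t pg = svwhilelt_b16_u64( 0, n ); // lanes [0, n) active
  svint16_t va = svld1_s16( pg, a );       // inactive lanes load zero
  svint16_t vb = svld1_s16( pg, b );
  return svdot_s64( acc, va, vb );         // zero lanes add nothing
}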

Running a video encoding job on SVE-capable machines using the
--preset=fast setting shows the following improvements in reported FPS:

Neoverse V1 (VL=256 bits): ~1.3%
Neoverse V2 (VL=128 bits): ~2.6%
georges-arm committed Nov 28, 2024
1 parent 24b36a3 commit 4c8bfef
Showing 2 changed files with 259 additions and 4 deletions.
12 changes: 8 additions & 4 deletions source/Lib/CommonLib/arm/InitARM.cpp
@@ -118,8 +118,14 @@ void TCoeffOps::initTCoeffOpsARM()
{
_initTCoeffOpsARM<NEON>();
}
#if TARGET_SIMD_ARM_SVE
if( vext >= SVE )
{
_initTCoeffOpsARM<SVE>();
}
#endif // TARGET_SIMD_ARM_SVE
}
#endif
#endif // ENABLE_SIMD_TRAFO

#if ENABLE_SIMD_OPT_BDOF
void InterPredInterpolation::initInterPredictionARM()
@@ -135,8 +141,6 @@ void InterPredInterpolation::initInterPredictionARM()
}
#endif



#endif // TARGET_SIMD_ARM
#endif // TARGET_SIMD_ARM

} // namespace
251 changes: 251 additions & 0 deletions source/Lib/CommonLib/arm/sve/Trafo_sve.cpp
@@ -0,0 +1,251 @@
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.
The Clear BSD License
Copyright (c) 2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------------------- */
/**
* \file Trafo_sve.cpp
* \brief SVE implementation of TCoeffOps for AArch64.
*/

// ====================================================================================================================
// Includes
// ====================================================================================================================
#include "CommonDefARM.h"
#include "CommonLib/CommonDef.h"

#include "TrQuant.h"
#include "TrQuant_EMT.h"

//! \ingroup CommonLib
//! \{

#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_TRAFO

#include <arm_neon_sve_bridge.h>
#include <arm_sve.h>

namespace vvenc
{

static svint16_t load_narrow_to_s16( const int32_t* src )
{
svint32_t lo = svld1_vnum_s32( svptrue_b32(), src, 0 );
svint32_t hi = svld1_vnum_s32( svptrue_b32(), src, 1 );
return svuzp1_s16( svreinterpret_s16_s32( lo ), svreinterpret_s16_s32( hi ) );
}

static int64_t shift_and_round( int64_t x, int shift )
{
return ( x + ( 1 << ( shift - 1 ) ) ) >> shift;
}

template<int vlBits>
static inline void fastFwdCore_reduce_x4_sve( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
int shift );

template<>
inline void fastFwdCore_reduce_x4_sve<128>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
int shift )
{
// For a 128-bit vector length we do not need to reduce the sum down, use
// svget_neonq to operate on the Neon vectors directly so we can use pairwise
// additions to incrementally sum each vector.
int64x2_t v01 = vpaddq_s64( svget_neonq_s64( v0 ), svget_neonq_s64( v1 ) );
int64x2_t v23 = vpaddq_s64( svget_neonq_s64( v2 ), svget_neonq_s64( v3 ) );
int32x4_t v0123 = vuzp1q_s32( vreinterpretq_s32_s64( v01 ), vreinterpretq_s32_s64( v23 ) );
v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
vst1q_s32( dst, v0123 );
}

template<>
inline void fastFwdCore_reduce_x4_sve<256>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
int shift )
{
// Halve the data width such that we only utilise the low half (128 bits) of each vector.
svint32_t v0_s32 = svuzp1_s32( svreinterpret_s32_s64( v0 ), svreinterpret_s32_s64( v0 ) );
svint32_t v1_s32 = svuzp1_s32( svreinterpret_s32_s64( v1 ), svreinterpret_s32_s64( v1 ) );
svint32_t v2_s32 = svuzp1_s32( svreinterpret_s32_s64( v2 ), svreinterpret_s32_s64( v2 ) );
svint32_t v3_s32 = svuzp1_s32( svreinterpret_s32_s64( v3 ), svreinterpret_s32_s64( v3 ) );

// Now that we have data in the low 128 bits of each vector, use svget_neonq
// to operate on the Neon vectors directly and use pairwise additions to
// incrementally sum each vector.
int32x4_t v01 = vpaddq_s32( svget_neonq_s32( v0_s32 ), svget_neonq_s32( v1_s32 ) );
int32x4_t v23 = vpaddq_s32( svget_neonq_s32( v2_s32 ), svget_neonq_s32( v3_s32 ) );
int32x4_t v0123 = vpaddq_s32( v01, v23 );
v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
vst1q_s32( dst, v0123 );
}

template<int vlBits, unsigned trVecs>
static void fastFwdCore_nVec_sve( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line,
unsigned reducedLine, unsigned cutoff, int shift )
{
CHECK( cutoff % 4 != 0, "Cutoff should be a multiple of four" );
CHECK( cutoff == 0, "Cutoff should be non-zero" );
CHECK( shift == 0, "Shift must be at least one" );

unsigned trSize = trVecs * svcnth();
unsigned i = 0;
for( ; i < ( reducedLine & ~3U ); i += 4 )
{
for( int j = 0; j < cutoff; j += 4 )
{
const TMatrixCoeff* tcj = tc + j * trSize;
const TCoeff* srci = src + i * trSize;

svint64_t sum00 = svdup_n_s64( 0 );
svint64_t sum01 = svdup_n_s64( 0 );
svint64_t sum02 = svdup_n_s64( 0 );
svint64_t sum03 = svdup_n_s64( 0 );
svint64_t sum10 = svdup_n_s64( 0 );
svint64_t sum11 = svdup_n_s64( 0 );
svint64_t sum12 = svdup_n_s64( 0 );
svint64_t sum13 = svdup_n_s64( 0 );
svint64_t sum20 = svdup_n_s64( 0 );
svint64_t sum21 = svdup_n_s64( 0 );
svint64_t sum22 = svdup_n_s64( 0 );
svint64_t sum23 = svdup_n_s64( 0 );
svint64_t sum30 = svdup_n_s64( 0 );
svint64_t sum31 = svdup_n_s64( 0 );
svint64_t sum32 = svdup_n_s64( 0 );
svint64_t sum33 = svdup_n_s64( 0 );
for( int k = 0; k < trVecs; ++k )
{
svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
svint16_t s1 = load_narrow_to_s16( srci + 1 * trSize );
svint16_t s2 = load_narrow_to_s16( srci + 2 * trSize );
svint16_t s3 = load_narrow_to_s16( srci + 3 * trSize );
svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
sum00 = svdot_s64( sum00, s0, c0 );
sum01 = svdot_s64( sum01, s0, c1 );
sum02 = svdot_s64( sum02, s0, c2 );
sum03 = svdot_s64( sum03, s0, c3 );
sum10 = svdot_s64( sum10, s1, c0 );
sum11 = svdot_s64( sum11, s1, c1 );
sum12 = svdot_s64( sum12, s1, c2 );
sum13 = svdot_s64( sum13, s1, c3 );
sum20 = svdot_s64( sum20, s2, c0 );
sum21 = svdot_s64( sum21, s2, c1 );
sum22 = svdot_s64( sum22, s2, c2 );
sum23 = svdot_s64( sum23, s2, c3 );
sum30 = svdot_s64( sum30, s3, c0 );
sum31 = svdot_s64( sum31, s3, c1 );
sum32 = svdot_s64( sum32, s3, c2 );
sum33 = svdot_s64( sum33, s3, c3 );

srci += svcnth();
tcj += svcnth();
}
TCoeff* dstij = dst + j * line + i;
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 0 * line, sum00, sum10, sum20, sum30, shift );
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 1 * line, sum01, sum11, sum21, sum31, shift );
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 2 * line, sum02, sum12, sum22, sum32, shift );
fastFwdCore_reduce_x4_sve<vlBits>( dstij + 3 * line, sum03, sum13, sum23, sum33, shift );
}
}
for( ; i < reducedLine; ++i )
{
for( int j = 0; j < cutoff; j += 4 )
{
const TMatrixCoeff* tcj = tc + j * trSize;
const TCoeff* srci = src + i * trSize;

svint64_t sum00 = svdup_n_s64( 0 );
svint64_t sum01 = svdup_n_s64( 0 );
svint64_t sum02 = svdup_n_s64( 0 );
svint64_t sum03 = svdup_n_s64( 0 );
for( int k = 0; k < trVecs; ++k )
{
svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
sum00 = svdot_s64( sum00, s0, c0 );
sum01 = svdot_s64( sum01, s0, c1 );
sum02 = svdot_s64( sum02, s0, c2 );
sum03 = svdot_s64( sum03, s0, c3 );

srci += svcnth();
tcj += svcnth();
}
TCoeff* dstij = dst + j * line + i;
dstij[ 0 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum00 ), shift );
dstij[ 1 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum01 ), shift );
dstij[ 2 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum02 ), shift );
dstij[ 3 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum03 ), shift );
}
}
}

template<>
void TCoeffOps::_initTCoeffOpsARM<SVE>()
{
// Wire up kernels based on how many vector iterations we need in the inner
// loop. Use Neon if we don't have at least one vector of work to do. Arm
// Neoverse micro-architectures only currently exist with vector lengths of
// 128 and 256 bits, so don't bother specialising for other vector lengths.
switch( svcnth() )
{
case 8: // SVE VL = 128-bits
fastFwdCore_2D[ 1 ] = fastFwdCore_nVec_sve<128, 1>;
fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<128, 2>;
fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<128, 4>;
fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<128, 8>;
break;
case 16: // SVE VL = 256-bits
fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<256, 1>;
fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<256, 2>;
fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<256, 4>;
break;
default:
// Don't use SVE for other vector lengths, fall back to Neon.
break;
}
}

} // namespace vvenc

#endif
//! \}
