diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp
index 9d664c02..5c7da279 100644
--- a/source/Lib/CommonLib/arm/InitARM.cpp
+++ b/source/Lib/CommonLib/arm/InitARM.cpp
@@ -118,8 +118,14 @@ void TCoeffOps::initTCoeffOpsARM()
   {
     _initTCoeffOpsARM<NEON>();
   }
+#if TARGET_SIMD_ARM_SVE
+  if( vext >= SVE )
+  {
+    _initTCoeffOpsARM<SVE>();
+  }
+#endif // TARGET_SIMD_ARM_SVE
 }
-#endif
+#endif // ENABLE_SIMD_TRAFO
 
 #if ENABLE_SIMD_OPT_BDOF
 void InterPredInterpolation::initInterPredictionARM()
@@ -135,8 +141,6 @@ void InterPredInterpolation::initInterPredictionARM()
 }
 #endif
 
-
-
-#endif // TARGET_SIMD_ARM
+#endif // TARGET_SIMD_ARM
 
 } // namespace
diff --git a/source/Lib/CommonLib/arm/sve/Trafo_sve.cpp b/source/Lib/CommonLib/arm/sve/Trafo_sve.cpp
new file mode 100644
index 00000000..474827b5
--- /dev/null
+++ b/source/Lib/CommonLib/arm/sve/Trafo_sve.cpp
@@ -0,0 +1,251 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+/**
+ * \file  Trafo_sve.cpp
+ * \brief SVE implementation of TCoeffOps for AArch64.
+ */
+
+// ====================================================================================================================
+// Includes
+// ====================================================================================================================
+#include "CommonDefARM.h"
+#include "CommonLib/CommonDef.h"
+
+#include "TrQuant.h"
+#include "TrQuant_EMT.h"
+
+//! \ingroup CommonLib
+//! \{
+
+#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_TRAFO
+
+#include <arm_neon_sve_bridge.h>
+#include <arm_sve.h>
+
+namespace vvenc
+{
+
+static svint16_t load_narrow_to_s16( const int32_t* src )
+{
+  svint32_t lo = svld1_vnum_s32( svptrue_b32(), src, 0 );
+  svint32_t hi = svld1_vnum_s32( svptrue_b32(), src, 1 );
+  return svuzp1_s16( svreinterpret_s16_s32( lo ), svreinterpret_s16_s32( hi ) );
+}
+
+static int64_t shift_and_round( int64_t x, int shift )
+{
+  return ( x + ( 1 << ( shift - 1 ) ) ) >> shift;
+}
+
+template<int vecLen>
+static inline void fastFwdCore_reduce_x4_sve( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
+                                              int shift );
+
+template<>
+inline void fastFwdCore_reduce_x4_sve<128>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
+                                            int shift )
+{
+  // For a 128-bit vector length we do not need to narrow the sums first; use
+  // svget_neonq to operate on the Neon vectors directly so we can use pairwise
+  // additions to incrementally sum each vector.
+  int64x2_t v01 = vpaddq_s64( svget_neonq_s64( v0 ), svget_neonq_s64( v1 ) );
+  int64x2_t v23 = vpaddq_s64( svget_neonq_s64( v2 ), svget_neonq_s64( v3 ) );
+  int32x4_t v0123 = vuzp1q_s32( vreinterpretq_s32_s64( v01 ), vreinterpretq_s32_s64( v23 ) );
+  v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
+  vst1q_s32( dst, v0123 );
+}
+
+template<>
+inline void fastFwdCore_reduce_x4_sve<256>( TCoeff* dst, svint64_t v0, svint64_t v1, svint64_t v2, svint64_t v3,
+                                            int shift )
+{
+  // Halve the data width such that we only utilise the low half (128 bits) of each vector.
+  svint32_t v0_s32 = svuzp1_s32( svreinterpret_s32_s64( v0 ), svreinterpret_s32_s64( v0 ) );
+  svint32_t v1_s32 = svuzp1_s32( svreinterpret_s32_s64( v1 ), svreinterpret_s32_s64( v1 ) );
+  svint32_t v2_s32 = svuzp1_s32( svreinterpret_s32_s64( v2 ), svreinterpret_s32_s64( v2 ) );
+  svint32_t v3_s32 = svuzp1_s32( svreinterpret_s32_s64( v3 ), svreinterpret_s32_s64( v3 ) );
+
+  // Now that we have data in the low 128 bits of each vector, use svget_neonq
+  // to operate on the Neon vectors directly and use pairwise additions to
+  // incrementally sum each vector.
+  int32x4_t v01 = vpaddq_s32( svget_neonq_s32( v0_s32 ), svget_neonq_s32( v1_s32 ) );
+  int32x4_t v23 = vpaddq_s32( svget_neonq_s32( v2_s32 ), svget_neonq_s32( v3_s32 ) );
+  int32x4_t v0123 = vpaddq_s32( v01, v23 );
+  v0123 = vrshlq_s32( v0123, vdupq_n_s32( -shift ) );
+  vst1q_s32( dst, v0123 );
+}
+
+template<int vecLen, unsigned trVecs>
+static void fastFwdCore_nVec_sve( const TMatrixCoeff* tc, const TCoeff* src, TCoeff* dst, unsigned line,
+                                  unsigned reducedLine, unsigned cutoff, int shift )
+{
+  CHECK( cutoff % 4 != 0, "Cutoff should be a multiple of four" );
+  CHECK( cutoff == 0, "Cutoff should be non-zero" );
+  CHECK( shift == 0, "Shift must be at least one" );
+
+  unsigned trSize = trVecs * svcnth();
+  unsigned i = 0;
+  for( ; i < ( reducedLine & ~3U ); i += 4 )
+  {
+    for( int j = 0; j < cutoff; j += 4 )
+    {
+      const TMatrixCoeff* tcj = tc + j * trSize;
+      const TCoeff* srci = src + i * trSize;
+
+      svint64_t sum00 = svdup_n_s64( 0 );
+      svint64_t sum01 = svdup_n_s64( 0 );
+      svint64_t sum02 = svdup_n_s64( 0 );
+      svint64_t sum03 = svdup_n_s64( 0 );
+      svint64_t sum10 = svdup_n_s64( 0 );
+      svint64_t sum11 = svdup_n_s64( 0 );
+      svint64_t sum12 = svdup_n_s64( 0 );
+      svint64_t sum13 = svdup_n_s64( 0 );
+      svint64_t sum20 = svdup_n_s64( 0 );
+      svint64_t sum21 = svdup_n_s64( 0 );
+      svint64_t sum22 = svdup_n_s64( 0 );
+      svint64_t sum23 = svdup_n_s64( 0 );
+      svint64_t sum30 = svdup_n_s64( 0 );
+      svint64_t sum31 = svdup_n_s64( 0 );
+      svint64_t sum32 = svdup_n_s64( 0 );
+      svint64_t sum33 = svdup_n_s64( 0 );
+      for( int k = 0; k < trVecs; ++k )
+      {
+        svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
+        svint16_t s1 = load_narrow_to_s16( srci + 1 * trSize );
+        svint16_t s2 = load_narrow_to_s16( srci + 2 * trSize );
+        svint16_t s3 = load_narrow_to_s16( srci + 3 * trSize );
+        svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
+        svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
+        svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
+        svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
+        sum00 = svdot_s64( sum00, s0, c0 );
+        sum01 = svdot_s64( sum01, s0, c1 );
+        sum02 = svdot_s64( sum02, s0, c2 );
+        sum03 = svdot_s64( sum03, s0, c3 );
+        sum10 = svdot_s64( sum10, s1, c0 );
+        sum11 = svdot_s64( sum11, s1, c1 );
+        sum12 = svdot_s64( sum12, s1, c2 );
+        sum13 = svdot_s64( sum13, s1, c3 );
+        sum20 = svdot_s64( sum20, s2, c0 );
+        sum21 = svdot_s64( sum21, s2, c1 );
+        sum22 = svdot_s64( sum22, s2, c2 );
+        sum23 = svdot_s64( sum23, s2, c3 );
+        sum30 = svdot_s64( sum30, s3, c0 );
+        sum31 = svdot_s64( sum31, s3, c1 );
+        sum32 = svdot_s64( sum32, s3, c2 );
+        sum33 = svdot_s64( sum33, s3, c3 );
+
+        srci += svcnth();
+        tcj += svcnth();
+      }
+      TCoeff* dstij = dst + j * line + i;
+      fastFwdCore_reduce_x4_sve<vecLen>( dstij + 0 * line, sum00, sum10, sum20, sum30, shift );
+      fastFwdCore_reduce_x4_sve<vecLen>( dstij + 1 * line, sum01, sum11, sum21, sum31, shift );
+      fastFwdCore_reduce_x4_sve<vecLen>( dstij + 2 * line, sum02, sum12, sum22, sum32, shift );
+      fastFwdCore_reduce_x4_sve<vecLen>( dstij + 3 * line, sum03, sum13, sum23, sum33, shift );
+    }
+  }
+  for( ; i < reducedLine; ++i )
+  {
+    for( int j = 0; j < cutoff; j += 4 )
+    {
+      const TMatrixCoeff* tcj = tc + j * trSize;
+      const TCoeff* srci = src + i * trSize;
+
+      svint64_t sum00 = svdup_n_s64( 0 );
+      svint64_t sum01 = svdup_n_s64( 0 );
+      svint64_t sum02 = svdup_n_s64( 0 );
+      svint64_t sum03 = svdup_n_s64( 0 );
+      for( int k = 0; k < trVecs; ++k )
+      {
+        svint16_t s0 = load_narrow_to_s16( srci + 0 * trSize );
+        svint16_t c0 = svld1_s16( svptrue_b16(), tcj + 0 * trSize );
+        svint16_t c1 = svld1_s16( svptrue_b16(), tcj + 1 * trSize );
+        svint16_t c2 = svld1_s16( svptrue_b16(), tcj + 2 * trSize );
+        svint16_t c3 = svld1_s16( svptrue_b16(), tcj + 3 * trSize );
+        sum00 = svdot_s64( sum00, s0, c0 );
+        sum01 = svdot_s64( sum01, s0, c1 );
+        sum02 = svdot_s64( sum02, s0, c2 );
+        sum03 = svdot_s64( sum03, s0, c3 );
+
+        srci += svcnth();
+        tcj += svcnth();
+      }
+      TCoeff* dstij = dst + j * line + i;
+      dstij[ 0 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum00 ), shift );
+      dstij[ 1 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum01 ), shift );
+      dstij[ 2 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum02 ), shift );
+      dstij[ 3 * line + 0 ] = shift_and_round( svaddv_s64( svptrue_b64(), sum03 ), shift );
+    }
+  }
+}
+
+template<>
+void TCoeffOps::_initTCoeffOpsARM<SVE>()
+{
+  // Wire up kernels based on how many vector iterations we need in the inner
+  // loop. Use Neon if we don't have at least one vector of work to do. Arm
+  // Neoverse micro-architectures currently only exist with vector lengths of
+  // 128 and 256 bits, so don't bother specialising for other vector lengths.
+  switch( svcnth() )
+  {
+  case 8: // SVE VL = 128-bit
+    fastFwdCore_2D[ 1 ] = fastFwdCore_nVec_sve<128, 1>;
+    fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<128, 2>;
+    fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<128, 4>;
+    fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<128, 8>;
+    break;
+  case 16: // SVE VL = 256-bit
+    fastFwdCore_2D[ 2 ] = fastFwdCore_nVec_sve<256, 1>;
+    fastFwdCore_2D[ 3 ] = fastFwdCore_nVec_sve<256, 2>;
+    fastFwdCore_2D[ 4 ] = fastFwdCore_nVec_sve<256, 4>;
+    break;
+  default:
+    // Don't use SVE for other vector lengths, fall back to Neon.
+    break;
+  }
+}
+
+} // namespace vvenc
+
+#endif
+//! \}
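Reviewer note, not part of the patch: the table wiring in _initTCoeffOpsARM<SVE>() follows a simple invariant if we assume that fastFwdCore_2D[ idx ] serves transform sizes of 4 << idx. Each slot then receives the instantiation whose trVecs satisfies trVecs * svcnth() == transform size, and any slot with less than one full vector of work per row stays on the Neon fallback. The sketch below illustrates that mapping with plain integer arithmetic; sveTrVecsForSlot, the hard-coded lane counts, and the index-to-size assumption are illustrative only and do not come from the vvenc sources.

#include <cstdio>

// Hypothetical helper mirroring the switch in _initTCoeffOpsARM<SVE>(): given
// the SVE vector length in 16-bit lanes (what svcnth() would return) and a
// fastFwdCore_2D slot index, return the trVecs that slot would be wired with,
// or 0 when the slot is left on the Neon fallback.
static unsigned sveTrVecsForSlot( unsigned lanes16, unsigned idx )
{
  const unsigned trSize = 4u << idx;  // assumed mapping of table index to transform size
  if( trSize < lanes16 )
    return 0;                         // fewer than one full vector of work: keep Neon
  return trSize / lanes16;            // number of SVE vectors per transform row
}

int main()
{
  for( unsigned idx = 1; idx <= 4; ++idx )
    std::printf( "VL=128: slot %u -> trVecs %u\n", idx, sveTrVecsForSlot( 8, idx ) );  // expect 1, 2, 4, 8
  for( unsigned idx = 1; idx <= 4; ++idx )
    std::printf( "VL=256: slot %u -> trVecs %u\n", idx, sveTrVecsForSlot( 16, idx ) ); // expect 0 (Neon), 1, 2, 4
  return 0;
}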