From 555a7194b3949ab4cb082ddc5f2fa69ef4965996 Mon Sep 17 00:00:00 2001 From: Paul Licameli Date: Tue, 10 Oct 2023 07:33:00 -0400 Subject: [PATCH] Overload of binary perform_parallel_simd_aligned writes one vector --- .../lib-time-and-pitch/StaffPad/SimdTypes.h | 23 +++++++++++++++++++ .../StaffPad/TimeAndPitch.cpp | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/libraries/lib-time-and-pitch/StaffPad/SimdTypes.h b/libraries/lib-time-and-pitch/StaffPad/SimdTypes.h index ff770924a3ec..86a8517648fc 100644 --- a/libraries/lib-time-and-pitch/StaffPad/SimdTypes.h +++ b/libraries/lib-time-and-pitch/StaffPad/SimdTypes.h @@ -106,6 +106,29 @@ __finl void perform_parallel_simd_aligned(float *a, float *b, int n, const fnc & f(a[i], b[i]); } +// two buffers read, one written +template <typename fnc> +__finl void perform_parallel_simd_aligned(float *a, const float *b, int n, const fnc &f) +{ + // fnc& f needs to be a lambda of type [](auto &a, const auto &b){}. + // the autos will be float_x4/float + constexpr int N = 4; + constexpr int byte_size = sizeof(float); + + assert(is_aligned(a, N * byte_size) && is_aligned(b, N * byte_size)); + + for (int i = 0; i <= n - N; i += N) + { + auto x = float_x4_load_aligned(a + i); + auto y = float_x4_load_aligned(b + i); + f(x, y); + store_aligned(x, a + i); + } + // deal with last partial packet + for (int i = n & (~(N - 1)); i < n; ++i) + f(a[i], b[i]); +} + /// template for applying math to one data buffer template <typename fnc> __finl void perform_parallel_simd_aligned(float *a, int n, const fnc &f) diff --git a/libraries/lib-time-and-pitch/StaffPad/TimeAndPitch.cpp b/libraries/lib-time-and-pitch/StaffPad/TimeAndPitch.cpp index 0612eb983390..f9390aa026b8 100644 --- a/libraries/lib-time-and-pitch/StaffPad/TimeAndPitch.cpp +++ b/libraries/lib-time-and-pitch/StaffPad/TimeAndPitch.cpp @@ -215,7 +215,7 @@ void _ms_to_lr(float* ch1, float* ch2, int n) template <typename T> inline void multiply(T* dst, const T* src, int32_t n) { - 
audio::simd::perform_parallel_simd_aligned(dst, const_cast<T*>(src), n, + audio::simd::perform_parallel_simd_aligned(dst, src, n, [](auto& d, auto& s) { d = d * s; }); } } // namespace