From b02293a079e2324df3e60efcdfa3a69bfea4fcc0 Mon Sep 17 00:00:00 2001 From: Martin Bruse Date: Wed, 1 Nov 2023 03:17:04 -0700 Subject: [PATCH] Added HWY_DASSERTs asserting buffer alignment to all Load and Store operations that require alignment. Fixed some bugs where Load/StoreU called Load/Store. PiperOrigin-RevId: 578459068 --- hwy/base.h | 7 ++++- hwy/ops/arm_neon-inl.h | 2 ++ hwy/ops/arm_sve-inl.h | 27 ++++++++++--------- hwy/ops/emu128-inl.h | 18 +++++++------ hwy/ops/ppc_vsx-inl.h | 22 ++++++++++------ hwy/ops/rvv-inl.h | 24 +++++++++-------- hwy/ops/scalar-inl.h | 17 +++++++----- hwy/ops/wasm_128-inl.h | 15 +++++++---- hwy/ops/wasm_256-inl.h | 19 ++++++++------ hwy/ops/x86_128-inl.h | 59 +++++++++++++++++++++++++++--------------- hwy/ops/x86_256-inl.h | 27 ++++++++++++++----- hwy/ops/x86_512-inl.h | 29 ++++++++++++++++----- 12 files changed, 172 insertions(+), 94 deletions(-) diff --git a/hwy/base.h b/hwy/base.h index f5bd885f59..a7b1ebc243 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -54,7 +54,7 @@ #endif // !HWY_IDE -#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \ +#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \ __cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \ !defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE) #if __has_include() @@ -293,6 +293,11 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) } while (0) #endif +#define HWY_DASSERT_ALIGNED(d, addr) \ + HWY_DASSERT(reinterpret_cast(addr) % \ + (Lanes(d) * sizeof(TFromD)) == \ + 0) + #if __cpp_constexpr >= 201304L #define HWY_CXX14_CONSTEXPR constexpr #else diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h index 8c1c784a68..4609dc4918 100644 --- a/hwy/ops/arm_neon-inl.h +++ b/hwy/ops/arm_neon-inl.h @@ -3538,6 +3538,7 @@ HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { // On Arm, Load is the same as LoadU. template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); return LoadU(d, p); } @@ -3742,6 +3743,7 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") // On Arm, Store is the same as StoreU. template HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); StoreU(v, d, aligned); } diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index aba972ae6c..bf649e6536 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -18,6 +18,7 @@ #include +#include "hwy/base.h" #include "hwy/ops/shared-inl.h" // Arm C215 declares that SVE vector lengths will always be a power of two. 
@@ -1635,10 +1636,10 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq) #if !HWY_SVE_HAVE_BFLOAT16 template -HWY_API VBF16 Load(Simd d, - const bfloat16_t* HWY_RESTRICT p) { - return BitCast(d, Load(RebindToUnsigned(), - reinterpret_cast(p))); +HWY_API VBF16 LoadU(Simd d, + const bfloat16_t* HWY_RESTRICT p) { + return BitCast(d, LoadU(RebindToUnsigned(), + reinterpret_cast(p))); } template @@ -1688,10 +1689,10 @@ HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) { #if !HWY_SVE_HAVE_BFLOAT16 template -HWY_API void Store(VBF16 v, Simd d, - bfloat16_t* HWY_RESTRICT p) { +HWY_API void StoreU(VBF16 v, Simd d, + bfloat16_t* HWY_RESTRICT p) { const RebindToUnsigned du; - Store(BitCast(du, v), du, reinterpret_cast(p)); + StoreU(BitCast(du, v), du, reinterpret_cast(p)); } template @@ -1711,18 +1712,20 @@ HWY_API void BlendedStore(VBF16 v, svbool_t m, Simd d, #endif -// ------------------------------ Load/StoreU +// ------------------------------ Load/Store // SVE only requires lane alignment, not natural alignment of the entire // vector. template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + return LoadU(d, p); } template -HWY_API void StoreU(const V v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); +HWY_API void Store(const V v, D d, TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + StoreU(v, d, p); } // ------------------------------ MaskedLoadOr diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index 451f4a040e..0e372e5fcb 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -1364,9 +1364,9 @@ HWY_API VFromD Max128Upper(D d, VFromD a, VFromD b) { // ------------------------------ Load template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { +HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { VFromD v; - CopyBytes(aligned, v.raw); // copy from array + CopyBytes(p, v.raw); // copy from array return v; } @@ -1383,8 +1383,9 @@ HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, } template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + return LoadU(d, aligned); } // In some use cases, "load single lane" is sufficient; otherwise avoid this. 
@@ -1422,13 +1423,14 @@ HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, // ------------------------------ Store template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - CopyBytes(v.raw, aligned); // copy to array +HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { + CopyBytes(v.raw, p); // copy to array } template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + StoreU(v, d, aligned); } template diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index 66e102136a..e5ca2e7bf7 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -614,7 +614,9 @@ HWY_API Vec128 CopySignToAbs(Vec128 abs, Vec128 sign) { // ------------------------------ Load template > -HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { +HWY_API Vec128 Load(D d, const T* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; using LoadRaw = typename detail::Raw128::AlignedRawVec; const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned); using ResultRaw = typename detail::Raw128::type; @@ -623,7 +625,7 @@ HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { // Any <= 64 bit template > -HWY_API VFromD Load(D d, const T* HWY_RESTRICT p) { +HWY_API VFromD LoadU(D d, const T* HWY_RESTRICT p) { using BitsT = UnsignedFromSize; BitsT bits; @@ -1072,8 +1074,9 @@ HWY_API Vec128 LoadU(D /* tag */, const T* HWY_RESTRICT p) { // For < 128 bit, LoadU == Load. template > -HWY_API VFromD LoadU(D d, const T* HWY_RESTRICT p) { - return Load(d, p); +HWY_API VFromD Load(D d, const T* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + return LoadU(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. @@ -1212,7 +1215,9 @@ HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, // ------------------------------ Store template > -HWY_API void Store(Vec128 v, D /* tag */, T* HWY_RESTRICT aligned) { +HWY_API void Store(Vec128 v, D d, T* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; using StoreRaw = typename detail::Raw128::AlignedRawVec; *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast(v.raw); } @@ -1224,7 +1229,7 @@ HWY_API void StoreU(Vec128 v, D /* tag */, T* HWY_RESTRICT p) { } template > -HWY_API void Store(VFromD v, D d, T* HWY_RESTRICT p) { +HWY_API void StoreU(VFromD v, D d, T* HWY_RESTRICT p) { using BitsT = UnsignedFromSize; const Repartition d_bits; @@ -1234,8 +1239,9 @@ HWY_API void Store(VFromD v, D d, T* HWY_RESTRICT p) { // For < 128 bit, StoreU == Store. template > -HWY_API void StoreU(VFromD v, D d, T* HWY_RESTRICT p) { - Store(v, d, p); +HWY_API void Store(VFromD v, D d, T* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + StoreU(v, d, p); } #if HWY_PPC_HAVE_9 diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h index 7256e58ba2..b163da4bbf 100644 --- a/hwy/ops/rvv-inl.h +++ b/hwy/ops/rvv-inl.h @@ -1627,15 +1627,16 @@ HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT) // There is no native BF16, treat as int16_t. 
template -HWY_API VFromD> Load(Simd d, +HWY_API VFromD> LoadU(Simd d, const bfloat16_t* HWY_RESTRICT p) { - return Load(RebindToSigned(), + return LoadU(RebindToSigned(), reinterpret_cast(p)); } template HWY_API void Store(VFromD> v, Simd d, bfloat16_t* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); Store(v, RebindToSigned(), reinterpret_cast(p)); } @@ -1644,26 +1645,26 @@ HWY_API void Store(VFromD> v, // NOTE: different type for float16_t than bfloat16_t, see Set(). template -HWY_API VFromD> Load(Simd d, +HWY_API VFromD> LoadU(Simd d, const float16_t* HWY_RESTRICT p) { return Load(RebindToUnsigned(), reinterpret_cast(p)); } template -HWY_API void Store(VFromD> v, +HWY_API void StoreU(VFromD> v, Simd d, float16_t* HWY_RESTRICT p) { - Store(v, RebindToUnsigned(), + StoreU(v, RebindToUnsigned(), reinterpret_cast(p)); } #endif // !HWY_HAVE_FLOAT16 -// ------------------------------ LoadU template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); // RVV only requires element alignment, not vector alignment. - return Load(d, p); + return LoadU(d, p); } // ------------------------------ MaskedLoad @@ -1858,11 +1859,12 @@ HWY_API void StoreN(VFromD v, D /*d*/, T* HWY_RESTRICT p, reinterpret_cast(p)); } -// ------------------------------ StoreU +// ------------------------------ Store template -HWY_API void StoreU(const V v, D d, TFromD* HWY_RESTRICT p) { +HWY_API void Store(const V v, D d, TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); // RVV only requires element alignment, not vector alignment. - Store(v, d, p); + StoreU(v, d, p); } // ------------------------------ Stream diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h index c113e4fded..f8b9a9aa6b 100644 --- a/hwy/ops/scalar-inl.h +++ b/hwy/ops/scalar-inl.h @@ -1068,7 +1068,7 @@ HWY_API Mask1 IsFinite(const Vec1 v) { // ------------------------------ Load template > -HWY_API Vec1 Load(D /* tag */, const T* HWY_RESTRICT aligned) { +HWY_API Vec1 LoadU(D d, const T* HWY_RESTRICT aligned) { T t; CopySameSize(aligned, &t); return Vec1(t); @@ -1086,8 +1086,9 @@ HWY_API Vec1 MaskedLoadOr(Vec1 v, Mask1 m, D d, } template > -HWY_API Vec1 LoadU(D d, const T* HWY_RESTRICT p) { - return Load(d, p); +HWY_API Vec1 Load(D d, const T* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + return LoadU(d, p); } // In some use cases, "load single lane" is sufficient; otherwise avoid this. 
@@ -1117,13 +1118,17 @@ HWY_API VFromD LoadNOr(VFromD no, D d, const T* HWY_RESTRICT p, // ------------------------------ Store template > -HWY_API void Store(const Vec1 v, D /* tag */, T* HWY_RESTRICT aligned) { +HWY_API void StoreU(const Vec1 v, D d, T* HWY_RESTRICT aligned) { + (void)d; CopySameSize(&v.raw, aligned); } template > -HWY_API void StoreU(const Vec1 v, D d, T* HWY_RESTRICT p) { - return Store(v, d, p); +HWY_API void Store(const Vec1 v, D d, T* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + (void)d; + CopySameSize(&v.raw, p); + return StoreU(v, d, p); } template > diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h index fdb96fa7d2..2600c34200 100644 --- a/hwy/ops/wasm_128-inl.h +++ b/hwy/ops/wasm_128-inl.h @@ -1860,13 +1860,15 @@ HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { // ------------------------------ Load template > -HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { +HWY_API Vec128 Load(D d, const T* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); return Vec128{wasm_v128_load(aligned)}; } // Partial template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); VFromD v; CopyBytes(p, &v); return v; @@ -1939,24 +1941,27 @@ HWY_INLINE double ExtractLane(const Vec128 v) { } // namespace detail template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; wasm_v128_store(aligned, v.raw); } // Partial template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { +HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { CopyBytes(&v, p); } template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { +HWY_API void StoreU(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { *p = detail::ExtractLane<0>(v); } // StoreU == Store. template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); Store(v, d, p); } diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h index 8ac6836525..c07ebb240b 100644 --- a/hwy/ops/wasm_256-inl.h +++ b/hwy/ops/wasm_256-inl.h @@ -755,7 +755,7 @@ HWY_API Vec256 BroadcastSignBit(const Vec256 v) { // ------------------------------ Load template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { +HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT aligned) { const Half dh; VFromD ret; ret.v0 = Load(dh, aligned); @@ -776,8 +776,9 @@ HWY_API Vec256 MaskedLoadOr(Vec256 v, Mask256 m, D d, // LoadU == Load. template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + return LoadU(d, p); } template @@ -791,16 +792,18 @@ HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { // ------------------------------ Store template > -HWY_API void Store(Vec256 v, D d, T* HWY_RESTRICT aligned) { +HWY_API void StoreU(Vec256 v, D d, T* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); const Half dh; - Store(v.v0, dh, aligned); - Store(v.v1, dh, aligned + Lanes(dh)); + StoreU(v.v0, dh, aligned); + StoreU(v.v1, dh, aligned + Lanes(dh)); } // StoreU == Store. 
template > -HWY_API void StoreU(Vec256 v, D d, T* HWY_RESTRICT p) { - Store(v, d, p); +HWY_API void Store(Vec256 v, D d, T* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + StoreU(v, d, p); } template > diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h index c112ba9ecc..3dfa76234a 100644 --- a/hwy/ops/x86_128-inl.h +++ b/hwy/ops/x86_128-inl.h @@ -1466,17 +1466,21 @@ HWY_API Vec128 ShiftRight(const Vec128 v) { // ------------------------------ Load template -HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return VFromD{_mm_load_si128(reinterpret_cast(aligned))}; } // Generic for all vector lengths greater than or equal to 16 bytes. template HWY_API VFromD Load(D d, const bfloat16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); const RebindToUnsigned du; return BitCast(d, Load(du, reinterpret_cast(aligned))); } template HWY_API Vec128 Load(D d, const float16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); #if HWY_HAVE_FLOAT16 return Vec128{_mm_load_ph(aligned)}; #else @@ -1485,11 +1489,15 @@ HWY_API Vec128 Load(D d, const float16_t* HWY_RESTRICT aligned) { #endif // HWY_HAVE_FLOAT16 } template -HWY_API Vec128 Load(D /* tag */, const float* HWY_RESTRICT aligned) { +HWY_API Vec128 Load(D d, const float* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return Vec128{_mm_load_ps(aligned)}; } template -HWY_API Vec128 Load(D /* tag */, const double* HWY_RESTRICT aligned) { +HWY_API Vec128 Load(D d, const double* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return Vec128{_mm_load_pd(aligned)}; } @@ -1523,7 +1531,7 @@ HWY_API Vec128 LoadU(D /* tag */, const double* HWY_RESTRICT p) { } template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { +HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t #if HWY_SAFE_PARTIAL_LOAD_STORE __m128i v = _mm_setzero_si128(); @@ -1535,7 +1543,7 @@ HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { } template -HWY_API Vec64 Load(D /* tag */, const float* HWY_RESTRICT p) { +HWY_API Vec64 LoadU(D /* tag */, const float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE __m128 v = _mm_setzero_ps(); CopyBytes<8>(p, &v); // not same size @@ -1547,7 +1555,7 @@ HWY_API Vec64 Load(D /* tag */, const float* HWY_RESTRICT p) { } template -HWY_API Vec64 Load(D /* tag */, const double* HWY_RESTRICT p) { +HWY_API Vec64 LoadU(D /* tag */, const double* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE __m128d v = _mm_setzero_pd(); CopyBytes<8>(p, &v); // not same size @@ -1558,7 +1566,7 @@ HWY_API Vec64 Load(D /* tag */, const double* HWY_RESTRICT p) { } template -HWY_API Vec32 Load(D /* tag */, const float* HWY_RESTRICT p) { +HWY_API Vec32 LoadU(D /* tag */, const float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE __m128 v = _mm_setzero_ps(); CopyBytes<4>(p, &v); // not same size @@ -1570,7 +1578,7 @@ HWY_API Vec32 Load(D /* tag */, const float* HWY_RESTRICT p) { // Any <= 32 bit except template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { +HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t // Clang ArgumentPromotionPass seems to break this code. We can unpoison // before SetTableIndices -> LoadU -> Load and the memory is poisoned again. 
@@ -1589,8 +1597,9 @@ HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { // For < 128 bit, LoadU == Load. template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + return LoadU(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. @@ -1602,17 +1611,21 @@ HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { // ------------------------------ Store template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); } // Generic for all vector lengths greater than or equal to 16 bytes. template HWY_API void Store(VFromD v, D d, bfloat16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); const RebindToUnsigned du; Store(BitCast(du, v), du, reinterpret_cast(aligned)); } template HWY_API void Store(Vec128 v, D d, float16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); #if HWY_HAVE_FLOAT16 (void)d; _mm_store_ph(aligned, v.raw); @@ -1622,12 +1635,15 @@ HWY_API void Store(Vec128 v, D d, float16_t* HWY_RESTRICT aligned) { #endif // HWY_HAVE_FLOAT16 } template -HWY_API void Store(Vec128 v, D /* tag */, float* HWY_RESTRICT aligned) { +HWY_API void Store(Vec128 v, D d, float* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm_store_ps(aligned, v.raw); } template -HWY_API void Store(Vec128 v, D /* tag */, - double* HWY_RESTRICT aligned) { +HWY_API void Store(Vec128 v, D d, double* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm_store_pd(aligned, v.raw); } @@ -1661,7 +1677,7 @@ HWY_API void StoreU(Vec128 v, D /* tag */, double* HWY_RESTRICT p) { } template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { +HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE (void)d; CopyBytes<8>(&v, p); // not same size @@ -1671,7 +1687,7 @@ HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { #endif } template -HWY_API void Store(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { +HWY_API void StoreU(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE CopyBytes<8>(&v, p); // not same size #else @@ -1679,7 +1695,7 @@ HWY_API void Store(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { #endif } template -HWY_API void Store(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { +HWY_API void StoreU(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE CopyBytes<8>(&v, p); // not same size #else @@ -1689,11 +1705,11 @@ HWY_API void Store(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { // Any <= 32 bit except template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { +HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { CopyBytes(&v, p); // not same size } template -HWY_API void Store(Vec32 v, D /* tag */, float* HWY_RESTRICT p) { +HWY_API void StoreU(Vec32 v, D /* tag */, float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE CopyBytes<4>(&v, p); // not same size #else @@ -1703,8 +1719,9 @@ HWY_API void Store(Vec32 v, D /* tag */, float* HWY_RESTRICT p) { // For < 128 bit, StoreU == Store. 
template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { + HWY_DASSERT_ALIGNED(d, p); + StoreU(v, d, p); } // ================================================== SWIZZLE (1) diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h index 2d841a1095..781b57d20a 100644 --- a/hwy/ops/x86_256-inl.h +++ b/hwy/ops/x86_256-inl.h @@ -3025,13 +3025,16 @@ HWY_API Mask256 IsFinite(Vec256 v) { // ------------------------------ Load template -HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return VFromD{ _mm256_load_si256(reinterpret_cast(aligned))}; } // bfloat16_t is handled by x86_128-inl.h. template HWY_API Vec256 Load(D d, const float16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); #if HWY_HAVE_FLOAT16 (void)d; return Vec256{_mm256_load_ph(aligned)}; @@ -3041,11 +3044,15 @@ HWY_API Vec256 Load(D d, const float16_t* HWY_RESTRICT aligned) { #endif // HWY_HAVE_FLOAT16 } template -HWY_API Vec256 Load(D /* tag */, const float* HWY_RESTRICT aligned) { +HWY_API Vec256 Load(D d, const float* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return Vec256{_mm256_load_ps(aligned)}; } template -HWY_API Vec256 Load(D /* tag */, const double* HWY_RESTRICT aligned) { +HWY_API Vec256 Load(D d, const double* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return Vec256{_mm256_load_pd(aligned)}; } @@ -3244,11 +3251,14 @@ HWY_API Vec256 LoadDup128(D /* tag */, const double* HWY_RESTRICT p) { // ------------------------------ Store template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw); } template HWY_API void Store(Vec256 v, D d, float16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); #if HWY_HAVE_FLOAT16 (void)d; _mm256_store_ph(aligned, v.raw); @@ -3258,12 +3268,15 @@ HWY_API void Store(Vec256 v, D d, float16_t* HWY_RESTRICT aligned) { #endif // HWY_HAVE_FLOAT16 } template -HWY_API void Store(Vec256 v, D /* tag */, float* HWY_RESTRICT aligned) { +HWY_API void Store(Vec256 v, D d, float* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm256_store_ps(aligned, v.raw); } template -HWY_API void Store(Vec256 v, D /* tag */, - double* HWY_RESTRICT aligned) { +HWY_API void Store(Vec256 v, D d, double* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm256_store_pd(aligned, v.raw); } diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index 85738aa4a8..e53a6e287a 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -2786,12 +2786,15 @@ HWY_API Mask512 IsFinite(Vec512 v) { // ------------------------------ Load template -HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { +HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return VFromD{_mm512_load_si512(aligned)}; } // bfloat16_t is handled by x86_128-inl.h. 
template HWY_API Vec512 Load(D d, const float16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); #if HWY_HAVE_FLOAT16 (void)d; return Vec512{_mm512_load_ph(aligned)}; @@ -2801,11 +2804,15 @@ HWY_API Vec512 Load(D d, const float16_t* HWY_RESTRICT aligned) { #endif // HWY_HAVE_FLOAT16 } template -HWY_API Vec512 Load(D /* tag */, const float* HWY_RESTRICT aligned) { +HWY_API Vec512 Load(D d, const float* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return Vec512{_mm512_load_ps(aligned)}; } template -HWY_API VFromD Load(D /* tag */, const double* HWY_RESTRICT aligned) { +HWY_API VFromD Load(D d, const double* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; return VFromD{_mm512_load_pd(aligned)}; } @@ -2939,13 +2946,17 @@ HWY_API VFromD LoadDup128(D /* tag */, const double* HWY_RESTRICT p) { // ------------------------------ Store template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { +HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw); } // bfloat16_t is handled by x86_128-inl.h. template -HWY_API void Store(Vec512 v, D /* tag */, +HWY_API void Store(Vec512 v, D d, float16_t* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; #if HWY_HAVE_FLOAT16 _mm512_store_ph(aligned, v.raw); #else @@ -2953,11 +2964,15 @@ HWY_API void Store(Vec512 v, D /* tag */, #endif } template -HWY_API void Store(Vec512 v, D /* tag */, float* HWY_RESTRICT aligned) { +HWY_API void Store(Vec512 v, D d, float* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm512_store_ps(aligned, v.raw); } template -HWY_API void Store(VFromD v, D /* tag */, double* HWY_RESTRICT aligned) { +HWY_API void Store(VFromD v, D d, double* HWY_RESTRICT aligned) { + HWY_DASSERT_ALIGNED(d, aligned); + (void)d; _mm512_store_pd(aligned, v.raw); }
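
Note on the new assertion: HWY_DASSERT_ALIGNED(d, addr), added to hwy/base.h above, checks that addr is a multiple of the full vector size, Lanes(d) * sizeof(TFromD<decltype(d)>), and is only active in builds where HWY_DASSERT is enabled. The helper below is a minimal standalone sketch of that same check, not part of the patch; IsVectorAligned and the cast to uintptr_t are illustrative assumptions.

#include <cstddef>
#include <cstdint>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical helper, not part of Highway: returns true if `ptr` satisfies
// the alignment that the patched Load/Store assert on in debug builds, i.e.
// the address is a multiple of the full vector size.
template <class D>
bool IsVectorAligned(D d, const hn::TFromD<D>* ptr) {
  const size_t vector_bytes = hn::Lanes(d) * sizeof(hn::TFromD<D>);
  return reinterpret_cast<uintptr_t>(ptr) % vector_bytes == 0;
}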
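
Caller-side impact: code that passes heap pointers to Load/Store must now hand in vector-aligned memory or switch to LoadU/StoreU, otherwise debug builds abort. A minimal usage sketch, assuming Highway's public static-dispatch API (hwy/highway.h) and hwy::AllocateAligned from hwy/aligned_allocator.h; the function name and buffer sizes are invented for illustration.

#include <cstddef>

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void RoundTripCopy() {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);

  // AllocateAligned returns memory aligned to HWY_ALIGNMENT (64 bytes), which
  // satisfies the new assertion for vectors of up to 64 bytes; scalable
  // targets with longer vectors may need stronger alignment to pass it.
  hwy::AlignedFreeUniquePtr<float[]> in = hwy::AllocateAligned<float>(N + 1);
  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(N + 1);

  const auto v = hn::Load(d, in.get());  // aligned: assertion passes
  hn::Store(v, d, out.get());            // aligned: assertion passes

  // Offsets that are only lane-aligned must keep using LoadU/StoreU; with
  // this patch, Load/Store on such pointers abort in debug builds.
  const auto v1 = hn::LoadU(d, in.get() + 1);
  hn::StoreU(v1, d, out.get() + 1);
}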
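
On the second part of the commit message ("Fixed some bugs where Load/StoreU called Load/Store"): on targets where one implementation serves both aligned and unaligned access, the forwarding now runs from Load to LoadU (and Store to StoreU), never the reverse, so the new assertion cannot fire for unaligned accesses. The toy below does not use Highway at all; it sketches that forwarding direction under invented names (Vec4f, ToyLoad, ToyLoadU).

#include <cassert>
#include <cstdint>
#include <cstring>

// Toy 128-bit vector and loads, standing in for any one target's overloads.
struct Vec4f {
  float raw[4];
};

inline bool Aligned16(const void* p) {
  return reinterpret_cast<uintptr_t>(p) % 16 == 0;
}

// Unaligned load: does the actual work and has no alignment requirement.
inline Vec4f ToyLoadU(const float* p) {
  Vec4f v;
  std::memcpy(v.raw, p, sizeof(v.raw));
  return v;
}

// Aligned load: asserts alignment, then forwards to the unaligned path.
// Forwarding the other way around (ToyLoadU calling ToyLoad) would make the
// assertion reject legitimately unaligned pointers, which is the class of
// bug this patch fixes on several targets.
inline Vec4f ToyLoad(const float* aligned) {
  assert(Aligned16(aligned));
  return ToyLoadU(aligned);
}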