diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index bb9fb3872e..ecb2996810 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -1592,7 +1592,7 @@ offsets. If you have offsets, you can convert them to indices via `ShiftRight`.
 
 ### Cache control
 
-All functions except `Stream` are defined in cache_control.h.
+All functions except `Stream` and `StreamLoad` are defined in cache_control.h.
 
 *   void **Stream**(Vec<D> a, D d, const T* aligned): copies `a[i]` into
     `aligned[i]` with non-temporal hint if available (useful for
@@ -1604,6 +1604,10 @@ All functions except `Stream` are defined in cache_control.h.
     can exceed `Lanes(d) * sizeof(T)`. The new contents of `aligned` may not be
     visible until `FlushStream` is called.
 
+*   Vec<D> **StreamLoad**(D, const T* aligned): returns
+    `aligned[i]` with non-temporal hint if available. May fault if the pointer
+    is not aligned to the vector size (using aligned_allocator.h is safe).
+
 *   void **FlushStream**(): ensures values written by previous `Stream` calls
     are visible on the current core.
     This is NOT sufficient for synchronizing across cores; when `Stream` outputs are to be consumed by
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 3b6e89c25f..d4377e75fa 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -1740,6 +1740,12 @@ HWY_API svbool_t IsFinite(const V v) {
 
 // ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
 
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
 #define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP)                   \
   template <size_t N, int kPow2>                                        \
   HWY_API HWY_SVE_V(BASE, BITS)                                         \
@@ -1768,6 +1774,13 @@ HWY_API svbool_t IsFinite(const V v) {
                 v);                                                     \
   }                                                                     \
   template <size_t N, int kPow2>                                        \
+  HWY_API HWY_SVE_V(BASE, BITS)                                         \
+      StreamLoad(HWY_SVE_D(BASE, BITS, N, kPow2) d,                     \
+                 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {        \
+    return svldnt1_##CHAR##BITS(detail::MakeMask(d),                    \
+                                detail::NativeLanePointer(p));          \
+  }                                                                     \
+  template <size_t N, int kPow2>                                        \
   HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m,        \
                             HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,    \
                             HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {   \
@@ -1797,6 +1810,12 @@
                   MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
 }
 
+template <class D, HWY_SVE_IF_EMULATED_D(D)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, StreamLoad(du, detail::U16LanePointer(p)));
+}
+
 // MaskedLoadOr is generic and does not require emulation.
 template
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 041c5721f8..dd98e6e86f 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -1228,6 +1228,21 @@ HWY_API MFromD<DFromV<V>> IsFinite(const V v) {
 
 #endif  // HWY_NATIVE_ISINF
 
+// ------------------------------ StreamLoad
+#if (defined(HWY_NATIVE_STREAM_LOAD) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
+template <class D>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  return Load(d, aligned);
+}
+
+#endif  // HWY_NATIVE_STREAM_LOAD
+
 // ------------------------------ LoadInterleaved2
 
 #if HWY_IDE || \
diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h
index 2d8c79dab2..0fe6dec038 100644
--- a/hwy/ops/ppc_vsx-inl.h
+++ b/hwy/ops/ppc_vsx-inl.h
@@ -2132,6 +2132,30 @@ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
 
 // ================================================== MEMORY (3)
 
+// ------------------------------ Non-temporal loads
+
+#if !HWY_S390X_HAVE_Z14
+
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  return Load(d, aligned);
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  unsigned char* HWY_RESTRICT p = const_cast<unsigned char*>(
+      reinterpret_cast<const unsigned char*>(HWY_ASSUME_ALIGNED(aligned, 16)));
+  return BitCast(d, Vec128<uint8_t>{vec_ldl(0, p)});
+}
+
+#endif  // !HWY_S390X_HAVE_Z14
+
 // ------------------------------ Non-temporal stores
 
 template
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 2520c0a175..945e8307af 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -5695,6 +5695,41 @@ HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
 
 // ================================================== MEMORY (3)
 
+// ------------------------------ Non-temporal loads
+#if HWY_TARGET <= HWY_SSE4
+
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  return Load(d, aligned);
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+// Suppress the ignoring attributes warning that is generated by
+// HWY_RCAST_ALIGNED(const __m128i*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
+  __m128i* HWY_RESTRICT p =
+      const_cast<__m128i*>(HWY_RCAST_ALIGNED(const __m128i*, aligned));
+
+  return BitCast(d, Vec128<uint64_t>{_mm_stream_load_si128(p)});
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
+#endif  // HWY_TARGET <= HWY_SSE4
+
 // ------------------------------ Non-temporal stores
 
 // On clang6, we see incorrect code generated for _mm_stream_pi, so
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index fb6114f76e..a67b14acbf 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -3667,6 +3667,26 @@ HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D d,
 
 #endif
 
+// ------------------------------ Non-temporal loads
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+// Suppress the ignoring attributes warning that is generated by
+// HWY_RCAST_ALIGNED(const __m256i*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
+  __m256i* HWY_RESTRICT p =
+      const_cast<__m256i*>(HWY_RCAST_ALIGNED(const __m256i*, aligned));
+
+  return BitCast(d, Vec256<uint64_t>{_mm256_stream_load_si256(p)});
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
 // ------------------------------ Non-temporal stores
 
 template
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index ae7b6a15d5..c2b9f5a62b 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -3293,6 +3293,26 @@ HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m, D /* tag */,
   _mm512_mask_storeu_pd(p, m.raw, v.raw);
 }
 
+// ------------------------------ Non-temporal loads
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+// Suppress the ignoring attributes warning that is generated by
+// HWY_RCAST_ALIGNED(const __m512i*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
+  __m512i* HWY_RESTRICT p =
+      const_cast<__m512i*>(HWY_RCAST_ALIGNED(const __m512i*, aligned));
+
+  return BitCast(d, Vec512<uint64_t>{_mm512_stream_load_si512(p)});
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
 // ------------------------------ Non-temporal stores
 
 template
diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc
index 6fb4cc4f41..d95d16dc9e 100644
--- a/hwy/tests/memory_test.cc
+++ b/hwy/tests/memory_test.cc
@@ -198,6 +198,37 @@ HWY_NOINLINE void TestAllStream() {
   ForFloatTypes(test);
 }
 
+struct TestStreamLoad {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    const size_t affected_bytes = (N * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
+                                  ~size_t(HWY_STREAM_MULTIPLE - 1);
+    const size_t affected_lanes = affected_bytes / sizeof(T);
+
+    auto in_lanes = AllocateAligned<T>(2 * affected_lanes);
+    HWY_ASSERT(in_lanes);
+
+    ZeroBytes(in_lanes.get(), 2 * affected_lanes * sizeof(T));
+    Stream(PositiveIota(d), d, in_lanes.get());
+    FlushStream();
+    const Vec<D> actual = StreamLoad(d, in_lanes.get());
+    HWY_ASSERT_VEC_EQ(d, in_lanes.get(), actual);
+  }
+};
+
+HWY_NOINLINE void TestAllStreamLoad() {
+  const ForPartialVectors<TestStreamLoad> test;
+  // No u8,u16.
+  test(uint32_t());
+  test(uint64_t());
+  // No i8,i16.
+  test(int32_t());
+  test(int64_t());
+  ForFloatTypes(test);
+}
+
 // Assumes little-endian byte order!
 struct TestScatter {
   template
@@ -574,6 +605,7 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStreamLoad);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);