diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index bb9fb3872e..ecb2996810 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -1592,7 +1592,7 @@ offsets. If you have offsets, you can convert them to indices via `ShiftRight`.
### Cache control
-All functions except `Stream` are defined in cache_control.h.
+All functions except `Stream` and `StreamLoad` are defined in cache_control.h.
* void **Stream**(Vec<D> a, D d, const T* aligned)
: copies
`a[i]` into `aligned[i]` with non-temporal hint if available (useful for
@@ -1604,6 +1604,10 @@ All functions except `Stream` are defined in cache_control.h.
can exceed `Lanes(d) * sizeof(T)`. The new contents of `aligned` may not be
visible until `FlushStream` is called.
+* Vec<D> **StreamLoad**(D, const T* aligned)
: returns
+ `aligned[i]` with non-temporal hint if available. May fault if the pointer
+ is not aligned to the vector size (using aligned_allocator.h is safe).
+
* void **FlushStream**()
: ensures values written by previous
`Stream` calls are visible on the current core. This is NOT sufficient for
synchronizing across cores; when `Stream` outputs are to be consumed by
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 3b6e89c25f..d4377e75fa 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -1740,6 +1740,12 @@ HWY_API svbool_t IsFinite(const V v) {
// ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2>                                          \
HWY_API HWY_SVE_V(BASE, BITS) \
@@ -1768,6 +1774,13 @@ HWY_API svbool_t IsFinite(const V v) {
v); \
} \
  template <size_t N, int kPow2>                                          \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ StreamLoad(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
+ const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
+ return svldnt1_##CHAR##BITS(detail::MakeMask(d), \
+ detail::NativeLanePointer(p)); \
+ } \
+  template <size_t N, int kPow2>                                         \
HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
@@ -1797,6 +1810,12 @@ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
}
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, StreamLoad(du, detail::U16LanePointer(p)));
+}
+
// MaskedLoadOr is generic and does not require emulation.
template
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 041c5721f8..dd98e6e86f 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -1228,6 +1228,21 @@ HWY_API MFromD IsFinite(const V v) {
#endif // HWY_NATIVE_ISINF
+// ------------------------------ StreamLoad
+#if (defined(HWY_NATIVE_STREAM_LOAD) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
+template <class D>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  return Load(d, aligned);
+}
+
+#endif // HWY_NATIVE_STREAM_LOAD
+
// ------------------------------ LoadInterleaved2
#if HWY_IDE || \
diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h
index 2d8c79dab2..0fe6dec038 100644
--- a/hwy/ops/ppc_vsx-inl.h
+++ b/hwy/ops/ppc_vsx-inl.h
@@ -2132,6 +2132,30 @@ HWY_API Vec128 operator%(Vec128 a, Vec128 b) {
// ================================================== MEMORY (3)
+// ------------------------------ Non-temporal loads
+
+#if !HWY_S390X_HAVE_Z14
+
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  return Load(d, aligned);
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  unsigned char* HWY_RESTRICT p = const_cast<unsigned char*>(
+      reinterpret_cast<const unsigned char*>(HWY_ASSUME_ALIGNED(aligned, 16)));
+  return BitCast(d, Vec128<uint8_t>{vec_ldl(0, p)});
+}
+
+#endif // !HWY_S390X_HAVE_Z14
+
// ------------------------------ Non-temporal stores
template
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 2520c0a175..945e8307af 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -5695,6 +5695,41 @@ HWY_API Vec128 Max(Vec128 a, Vec128 b) {
// ================================================== MEMORY (3)
+// ------------------------------ Non-temporal loads
+#if HWY_TARGET <= HWY_SSE4
+
+#ifdef HWY_NATIVE_STREAM_LOAD
+#undef HWY_NATIVE_STREAM_LOAD
+#else
+#define HWY_NATIVE_STREAM_LOAD
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  return Load(d, aligned);
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+// Suppress the ignoring attributes warning that is generated by
+// HWY_RCAST_ALIGNED(const __m128i*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
+  __m128i* HWY_RESTRICT p =
+      const_cast<__m128i*>(HWY_RCAST_ALIGNED(const __m128i*, aligned));
+
+  return BitCast(d, Vec128<uint8_t>{_mm_stream_load_si128(p)});
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
+#endif // HWY_TARGET <= HWY_SSE4
+
// ------------------------------ Non-temporal stores
// On clang6, we see incorrect code generated for _mm_stream_pi, so
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index fb6114f76e..a67b14acbf 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -3667,6 +3667,26 @@ HWY_API void BlendedStore(Vec256 v, Mask256 m, D d,
#endif
+// ------------------------------ Non-temporal loads
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+// Suppress the ignoring attributes warning that is generated by
+// HWY_RCAST_ALIGNED(const __m256i*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
+  __m256i* HWY_RESTRICT p =
+      const_cast<__m256i*>(HWY_RCAST_ALIGNED(const __m256i*, aligned));
+
+  return BitCast(d, Vec256<uint8_t>{_mm256_stream_load_si256(p)});
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
// ------------------------------ Non-temporal stores
template
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index ae7b6a15d5..c2b9f5a62b 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -3293,6 +3293,26 @@ HWY_API void BlendedStore(Vec512 v, Mask512 m, D /* tag */,
_mm512_mask_storeu_pd(p, m.raw, v.raw);
}
+// ------------------------------ Non-temporal loads
+template <class D, HWY_IF_V_SIZE_D(D, 64)>
+HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+// Suppress the ignoring attributes warning that is generated by
+// HWY_RCAST_ALIGNED(const __m512i*, aligned) with GCC
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
+#endif
+
+  __m512i* HWY_RESTRICT p =
+      const_cast<__m512i*>(HWY_RCAST_ALIGNED(const __m512i*, aligned));
+
+  return BitCast(d, Vec512<uint8_t>{_mm512_stream_load_si512(p)});
+
+#if HWY_COMPILER_GCC
+  HWY_DIAGNOSTICS(pop)
+#endif
+}
+
// ------------------------------ Non-temporal stores
template
diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc
index 6fb4cc4f41..d95d16dc9e 100644
--- a/hwy/tests/memory_test.cc
+++ b/hwy/tests/memory_test.cc
@@ -198,6 +198,37 @@ HWY_NOINLINE void TestAllStream() {
ForFloatTypes(test);
}
+struct TestStreamLoad {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    const size_t affected_bytes = (N * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
+                                  ~size_t(HWY_STREAM_MULTIPLE - 1);
+    const size_t affected_lanes = affected_bytes / sizeof(T);
+
+    auto in_lanes = AllocateAligned<T>(2 * affected_lanes);
+    HWY_ASSERT(in_lanes);
+
+    ZeroBytes(in_lanes.get(), 2 * affected_lanes * sizeof(T));
+    Stream(PositiveIota(d), d, in_lanes.get());
+    FlushStream();
+    const Vec<D> actual = StreamLoad(d, in_lanes.get());
+    HWY_ASSERT_VEC_EQ(d, in_lanes.get(), actual);
+  }
+};
+
+HWY_NOINLINE void TestAllStreamLoad() {
+  const ForPartialVectors<TestStreamLoad> test;
+  // No u8,u16.
+  test(uint32_t());
+  test(uint64_t());
+  // No i8,i16.
+  test(int32_t());
+  test(int64_t());
+  ForFloatTypes(test);
+}
+
// Assumes little-endian byte order!
struct TestScatter {
template
@@ -574,6 +605,7 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStreamLoad);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);