Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added StreamLoad op #2044

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion g3doc/quick_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1592,7 +1592,7 @@ offsets. If you have offsets, you can convert them to indices via `ShiftRight`.

### Cache control

All functions except `Stream` are defined in cache_control.h.
All functions except `Stream` and `StreamLoad` are defined in cache_control.h.

* <code>void **Stream**(Vec&lt;D&gt; a, D d, const T* aligned)</code>: copies
`a[i]` into `aligned[i]` with non-temporal hint if available (useful for
Expand All @@ -1604,6 +1604,10 @@ All functions except `Stream` are defined in cache_control.h.
can exceed `Lanes(d) * sizeof(T)`. The new contents of `aligned` may not be
visible until `FlushStream` is called.

* <code>Vec&lt;D&gt; **StreamLoad**(D, const T* aligned)</code>: returns
  `aligned[i]`, loaded with a non-temporal hint where the target supports one.
  May fault if the pointer is not aligned to the vector size (allocations from
  aligned_allocator.h are suitably aligned).

* <code>void **FlushStream**()</code>: ensures values written by previous
`Stream` calls are visible on the current core. This is NOT sufficient for
synchronizing across cores; when `Stream` outputs are to be consumed by
Expand Down
19 changes: 19 additions & 0 deletions hwy/ops/arm_sve-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1740,6 +1740,12 @@ HWY_API svbool_t IsFinite(const V v) {

// ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream

// SVE has a native non-temporal load (svldnt1, used below), so advertise it:
// flip HWY_NATIVE_STREAM_LOAD so that generic_ops-inl.h does not emit its
// plain-Load fallback for this target. The ifdef/undef-else-define pattern is
// the per-target feature-toggle idiom used throughout Highway.
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
Expand Down Expand Up @@ -1768,6 +1774,13 @@ HWY_API svbool_t IsFinite(const V v) {
v); \
} \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
StreamLoad(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
return svldnt1_##CHAR##BITS(detail::MakeMask(d), \
detail::NativeLanePointer(p)); \
} \
template <size_t N, int kPow2> \
HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
Expand Down Expand Up @@ -1797,6 +1810,12 @@ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
}

// StreamLoad for "emulated" lane types, which have no direct svldnt1
// overload (judging by U16LanePointer, these are 16-bit lane types such as
// f16/bf16 — confirm against HWY_SVE_IF_EMULATED_D). Reinterpret the memory
// as the same-width unsigned lanes, stream-load those, and bit-cast back.
template <class D, HWY_SVE_IF_EMULATED_D(D)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<D> du;  // same lane width, unsigned
  const VFromD<decltype(du)> raw = StreamLoad(du, detail::U16LanePointer(p));
  return BitCast(d, raw);
}

// MaskedLoadOr is generic and does not require emulation.

template <class D, HWY_SVE_IF_EMULATED_D(D)>
Expand Down
15 changes: 15 additions & 0 deletions hwy/ops/generic_ops-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1228,6 +1228,21 @@ HWY_API MFromD<D> IsFinite(const V v) {

#endif // HWY_NATIVE_ISINF

// ------------------------------ StreamLoad

// Portable fallback. Targets with a true non-temporal load (e.g. SSE4's
// MOVNTDQA or SVE's svldnt1) define HWY_NATIVE_STREAM_LOAD and supply their
// own overloads; all other targets get a plain aligned Load here.
#if (defined(HWY_NATIVE_STREAM_LOAD) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

// Returns aligned[i]. The non-temporal hint is advisory only; this generic
// version simply forwards to Load. `aligned` must be vector-aligned.
template <class D>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

#endif  // HWY_NATIVE_STREAM_LOAD

// ------------------------------ LoadInterleaved2

#if HWY_IDE || \
Expand Down
24 changes: 24 additions & 0 deletions hwy/ops/ppc_vsx-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2132,6 +2132,30 @@ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {

// ================================================== MEMORY (3)

// ------------------------------ Non-temporal loads

// NOTE(review): s390x Z14+ is excluded — presumably vec_ldl is not available
// there, so that target keeps the generic Load-based fallback; confirm.
#if !HWY_S390X_HAVE_Z14

// Advertise a native StreamLoad so generic_ops-inl.h skips its fallback.
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

// Partial vectors (<= 8 bytes): no non-temporal instruction applies; a
// normal aligned Load is already the best option.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// Full 128-bit vectors: vec_ldl (lvxl, "load vector indexed LRU") marks the
// touched cache line least-recently-used so it is evicted first. The
// instruction ignores the low address bits, hence the 16-byte alignment
// requirement documented via HWY_ASSUME_ALIGNED. const_cast is needed only
// because vec_ldl takes a non-const pointer; the memory is not written.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  unsigned char* HWY_RESTRICT p = const_cast<unsigned char*>(
      reinterpret_cast<const unsigned char*>(HWY_ASSUME_ALIGNED(aligned, 16)));
  return BitCast(d, Vec128<uint8_t>{vec_ldl(0, p)});
}

#endif  // !HWY_S390X_HAVE_Z14

// ------------------------------ Non-temporal stores

template <class D>
Expand Down
35 changes: 35 additions & 0 deletions hwy/ops/x86_128-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -5695,6 +5695,41 @@ HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {

// ================================================== MEMORY (3)

// ------------------------------ Non-temporal loads
// MOVNTDQA (SSE4.1) is the only x86 non-temporal load; older targets use the
// generic Load-based fallback from generic_ops-inl.h.
#if HWY_TARGET <= HWY_SSE4

// Advertise a native StreamLoad so generic_ops-inl.h skips its fallback.
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

// Partial vectors (<= 8 bytes): MOVNTDQA always loads a full 16 bytes, so
// just use a normal aligned Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// Full 128-bit vectors: _mm_stream_load_si128 (MOVNTDQA). Requires 16-byte
// alignment. NOTE(review): the non-temporal hint only takes effect on
// write-combining memory; on ordinary write-back memory this behaves like a
// regular aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const __m128i*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  // const_cast: the intrinsic takes a non-const pointer even though it only
  // reads from the memory.
  __m128i* HWY_RESTRICT p =
      const_cast<__m128i*>(HWY_RCAST_ALIGNED(const __m128i*, aligned));

  return BitCast(d, Vec128<uint8_t>{_mm_stream_load_si128(p)});

  // Pragmas are lexical, so popping after the return statement is valid.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

#endif  // HWY_TARGET <= HWY_SSE4

// ------------------------------ Non-temporal stores

// On clang6, we see incorrect code generated for _mm_stream_pi, so
Expand Down
20 changes: 20 additions & 0 deletions hwy/ops/x86_256-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3667,6 +3667,26 @@ HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D d,

#endif

// ------------------------------ Non-temporal loads
// 256-bit VMOVNTDQA via _mm256_stream_load_si256. Requires 32-byte
// alignment. NOTE(review): the non-temporal hint only takes effect on
// write-combining memory; on ordinary write-back memory this behaves like a
// regular aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const __m256i*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  // const_cast: the intrinsic takes a non-const pointer even though it only
  // reads from the memory.
  __m256i* HWY_RESTRICT p =
      const_cast<__m256i*>(HWY_RCAST_ALIGNED(const __m256i*, aligned));

  return BitCast(d, Vec256<uint8_t>{_mm256_stream_load_si256(p)});

  // Pragmas are lexical, so popping after the return statement is valid.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// ------------------------------ Non-temporal stores

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
Expand Down
20 changes: 20 additions & 0 deletions hwy/ops/x86_512-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3293,6 +3293,26 @@ HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m, D /* tag */,
_mm512_mask_storeu_pd(p, m.raw, v.raw);
}

// ------------------------------ Non-temporal loads
// 512-bit VMOVNTDQA via _mm512_stream_load_si512. Requires 64-byte
// alignment. NOTE(review): the non-temporal hint only takes effect on
// write-combining memory; on ordinary write-back memory this behaves like a
// regular aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 64)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const __m512i*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  // const_cast: the intrinsic takes a non-const pointer even though it only
  // reads from the memory.
  __m512i* HWY_RESTRICT p =
      const_cast<__m512i*>(HWY_RCAST_ALIGNED(const __m512i*, aligned));

  return BitCast(d, Vec512<uint8_t>{_mm512_stream_load_si512(p)});

  // Pragmas are lexical, so popping after the return statement is valid.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// ------------------------------ Non-temporal stores

template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
Expand Down
32 changes: 32 additions & 0 deletions hwy/tests/memory_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,37 @@ HWY_NOINLINE void TestAllStream() {
ForFloatTypes(test);
}

// Round-trips one vector through Stream followed by StreamLoad and verifies
// that the reloaded lanes match what was stored.
struct TestStreamLoad {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t num_lanes = Lanes(d);

    // Stream may touch whole HWY_STREAM_MULTIPLE-byte units, so round the
    // vector size up to the next multiple of that (expressed in lanes), and
    // allocate twice as much for slack.
    const size_t vec_bytes = num_lanes * sizeof(T);
    const size_t padded_bytes =
        ((vec_bytes + HWY_STREAM_MULTIPLE - 1) / HWY_STREAM_MULTIPLE) *
        HWY_STREAM_MULTIPLE;
    const size_t padded_lanes = padded_bytes / sizeof(T);

    auto buf = AllocateAligned<T>(2 * padded_lanes);
    HWY_ASSERT(buf);
    ZeroBytes(buf.get(), 2 * padded_lanes * sizeof(T));

    // Non-temporal store, make it visible to this core, then non-temporal
    // load it back and compare against the buffer contents.
    Stream(PositiveIota(d), d, buf.get());
    FlushStream();
    const Vec<D> reloaded = StreamLoad(d, buf.get());
    HWY_ASSERT_VEC_EQ(d, buf.get(), reloaded);
  }
};

// Runs TestStreamLoad over the same lane types as TestAllStream above —
// presumably Stream/StreamLoad are not supported for 8/16-bit lanes
// (NOTE(review): confirm against TestAllStream's type list).
HWY_NOINLINE void TestAllStreamLoad() {
  const ForPartialVectors<TestStreamLoad> test;
  // No u8,u16.
  test(uint32_t());
  test(uint64_t());
  // No i8,i16.
  test(int32_t());
  test(int64_t());
  ForFloatTypes(test);
}

// Assumes little-endian byte order!
struct TestScatter {
template <class T, class D>
Expand Down Expand Up @@ -574,6 +605,7 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStreamLoad);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
Expand Down
Loading