Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added StreamLoad op #2044

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion g3doc/quick_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1592,7 +1592,7 @@ offsets. If you have offsets, you can convert them to indices via `ShiftRight`.

### Cache control

All functions except `Stream` are defined in cache_control.h.
All functions except `Stream` and `StreamLoad` are defined in cache_control.h.

* <code>void **Stream**(Vec&lt;D&gt; a, D d, const T* aligned)</code>: copies
`a[i]` into `aligned[i]` with non-temporal hint if available (useful for
Expand All @@ -1604,6 +1604,10 @@ All functions except `Stream` are defined in cache_control.h.
can exceed `Lanes(d) * sizeof(T)`. The new contents of `aligned` may not be
visible until `FlushStream` is called.

* <code>Vec&lt;D&gt; **StreamLoad**(D, const T* aligned)</code>: returns
  `aligned[i]`, loaded with a non-temporal hint where the target supports one.
  May fault if the pointer is not aligned to the vector size (allocations from
  aligned_allocator.h are suitably aligned).

* <code>void **FlushStream**()</code>: ensures values written by previous
`Stream` calls are visible on the current core. This is NOT sufficient for
synchronizing across cores; when `Stream` outputs are to be consumed by
Expand Down
19 changes: 19 additions & 0 deletions hwy/ops/arm_sve-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1740,6 +1740,12 @@ HWY_API svbool_t IsFinite(const V v) {

// ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream

// SVE has a native non-temporal load (svldnt1, used below), so advertise it:
// flip HWY_NATIVE_STREAM_LOAD so that generic_ops-inl.h does not emit its
// plain-Load fallback for this target. The ifdef/undef-else-define pattern is
// the per-target feature-toggle idiom used throughout Highway.
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

#define HWY_SVE_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
Expand Down Expand Up @@ -1768,6 +1774,13 @@ HWY_API svbool_t IsFinite(const V v) {
v); \
} \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
StreamLoad(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
return svldnt1_##CHAR##BITS(detail::MakeMask(d), \
detail::NativeLanePointer(p)); \
} \
template <size_t N, int kPow2> \
HWY_API void BlendedStore(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
Expand Down Expand Up @@ -1797,6 +1810,12 @@ HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
MaskedLoad(RebindMask(du, m), du, detail::U16LanePointer(p)));
}

// StreamLoad for "emulated" lane types, which have no direct svldnt1
// overload (judging by U16LanePointer, these are 16-bit lane types such as
// f16/bf16 — confirm against HWY_SVE_IF_EMULATED_D). Reinterpret the memory
// as the same-width unsigned lanes, stream-load those, and bit-cast back.
template <class D, HWY_SVE_IF_EMULATED_D(D)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<D> du;  // same lane width, unsigned
  const VFromD<decltype(du)> raw = StreamLoad(du, detail::U16LanePointer(p));
  return BitCast(d, raw);
}

// MaskedLoadOr is generic and does not require emulation.

template <class D, HWY_SVE_IF_EMULATED_D(D)>
Expand Down
15 changes: 15 additions & 0 deletions hwy/ops/generic_ops-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1228,6 +1228,21 @@ HWY_API MFromD<D> IsFinite(const V v) {

#endif // HWY_NATIVE_ISINF

// ------------------------------ StreamLoad

// Portable fallback. Targets with a true non-temporal load (e.g. SSE4's
// MOVNTDQA or SVE's svldnt1) define HWY_NATIVE_STREAM_LOAD and supply their
// own overloads; all other targets get a plain aligned Load here.
#if (defined(HWY_NATIVE_STREAM_LOAD) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

// Returns aligned[i]. The non-temporal hint is advisory only; this generic
// version simply forwards to Load. `aligned` must be vector-aligned.
template <class D>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

#endif  // HWY_NATIVE_STREAM_LOAD

// ------------------------------ LoadInterleaved2

#if HWY_IDE || \
Expand Down
24 changes: 24 additions & 0 deletions hwy/ops/ppc_vsx-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2132,6 +2132,30 @@ HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {

// ================================================== MEMORY (3)

// ------------------------------ Non-temporal loads

// NOTE(review): s390x Z14+ is excluded — presumably vec_ldl is not available
// there, so that target keeps the generic Load-based fallback; confirm.
#if !HWY_S390X_HAVE_Z14

// Advertise a native StreamLoad so generic_ops-inl.h skips its fallback.
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

// Partial vectors (<= 8 bytes): no non-temporal instruction applies; a
// normal aligned Load is already the best option.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// Full 128-bit vectors: vec_ldl (lvxl, "load vector indexed LRU") marks the
// touched cache line least-recently-used so it is evicted first. The
// instruction ignores the low address bits, hence the 16-byte alignment
// requirement documented via HWY_ASSUME_ALIGNED. const_cast is needed only
// because vec_ldl takes a non-const pointer; the memory is not written.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  unsigned char* HWY_RESTRICT p = const_cast<unsigned char*>(
      reinterpret_cast<const unsigned char*>(HWY_ASSUME_ALIGNED(aligned, 16)));
  return BitCast(d, Vec128<uint8_t>{vec_ldl(0, p)});
}

#endif  // !HWY_S390X_HAVE_Z14

// ------------------------------ Non-temporal stores

template <class D>
Expand Down
35 changes: 35 additions & 0 deletions hwy/ops/x86_128-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -5695,6 +5695,41 @@ HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {

// ================================================== MEMORY (3)

// ------------------------------ Non-temporal loads
// MOVNTDQA (SSE4.1) is the only x86 non-temporal load; older targets use the
// generic Load-based fallback from generic_ops-inl.h.
#if HWY_TARGET <= HWY_SSE4

// Advertise a native StreamLoad so generic_ops-inl.h skips its fallback.
#ifdef HWY_NATIVE_STREAM_LOAD
#undef HWY_NATIVE_STREAM_LOAD
#else
#define HWY_NATIVE_STREAM_LOAD
#endif

// Partial vectors (<= 8 bytes): MOVNTDQA always loads a full 16 bytes, so
// just use a normal aligned Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// Full 128-bit vectors: _mm_stream_load_si128 (MOVNTDQA). Requires 16-byte
// alignment. NOTE(review): the non-temporal hint only takes effect on
// write-combining memory; on ordinary write-back memory this behaves like a
// regular aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const __m128i*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  // const_cast: the intrinsic takes a non-const pointer even though it only
  // reads from the memory.
  __m128i* HWY_RESTRICT p =
      const_cast<__m128i*>(HWY_RCAST_ALIGNED(const __m128i*, aligned));

  return BitCast(d, Vec128<uint8_t>{_mm_stream_load_si128(p)});

  // Pragmas are lexical, so popping after the return statement is valid.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

#endif  // HWY_TARGET <= HWY_SSE4

// ------------------------------ Non-temporal stores

// On clang6, we see incorrect code generated for _mm_stream_pi, so
Expand Down
20 changes: 20 additions & 0 deletions hwy/ops/x86_256-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3667,6 +3667,26 @@ HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m, D d,

#endif

// ------------------------------ Non-temporal loads
// 256-bit VMOVNTDQA via _mm256_stream_load_si256. Requires 32-byte
// alignment. NOTE(review): the non-temporal hint only takes effect on
// write-combining memory; on ordinary write-back memory this behaves like a
// regular aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const __m256i*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  // const_cast: the intrinsic takes a non-const pointer even though it only
  // reads from the memory.
  __m256i* HWY_RESTRICT p =
      const_cast<__m256i*>(HWY_RCAST_ALIGNED(const __m256i*, aligned));

  return BitCast(d, Vec256<uint8_t>{_mm256_stream_load_si256(p)});

  // Pragmas are lexical, so popping after the return statement is valid.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// ------------------------------ Non-temporal stores

template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT3264_D(D)>
Expand Down
20 changes: 20 additions & 0 deletions hwy/ops/x86_512-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3293,6 +3293,26 @@ HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m, D /* tag */,
_mm512_mask_storeu_pd(p, m.raw, v.raw);
}

// ------------------------------ Non-temporal loads
// 512-bit VMOVNTDQA via _mm512_stream_load_si512. Requires 64-byte
// alignment. NOTE(review): the non-temporal hint only takes effect on
// write-combining memory; on ordinary write-back memory this behaves like a
// regular aligned load.
template <class D, HWY_IF_V_SIZE_D(D, 64)>
HWY_API VFromD<D> StreamLoad(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const __m512i*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif

  // const_cast: the intrinsic takes a non-const pointer even though it only
  // reads from the memory.
  __m512i* HWY_RESTRICT p =
      const_cast<__m512i*>(HWY_RCAST_ALIGNED(const __m512i*, aligned));

  return BitCast(d, Vec512<uint8_t>{_mm512_stream_load_si512(p)});

  // Pragmas are lexical, so popping after the return statement is valid.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// ------------------------------ Non-temporal stores

template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT3264_D(D)>
Expand Down
32 changes: 32 additions & 0 deletions hwy/tests/memory_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,37 @@ HWY_NOINLINE void TestAllStream() {
ForFloatTypes(test);
}

// Round-trips one vector through Stream followed by StreamLoad and verifies
// that the reloaded lanes match what was stored.
struct TestStreamLoad {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t num_lanes = Lanes(d);

    // Stream may touch whole HWY_STREAM_MULTIPLE-byte units, so round the
    // vector size up to the next multiple of that (expressed in lanes), and
    // allocate twice as much for slack.
    const size_t vec_bytes = num_lanes * sizeof(T);
    const size_t padded_bytes =
        ((vec_bytes + HWY_STREAM_MULTIPLE - 1) / HWY_STREAM_MULTIPLE) *
        HWY_STREAM_MULTIPLE;
    const size_t padded_lanes = padded_bytes / sizeof(T);

    auto buf = AllocateAligned<T>(2 * padded_lanes);
    HWY_ASSERT(buf);
    ZeroBytes(buf.get(), 2 * padded_lanes * sizeof(T));

    // Non-temporal store, make it visible to this core, then non-temporal
    // load it back and compare against the buffer contents.
    Stream(PositiveIota(d), d, buf.get());
    FlushStream();
    const Vec<D> reloaded = StreamLoad(d, buf.get());
    HWY_ASSERT_VEC_EQ(d, buf.get(), reloaded);
  }
};

// Runs TestStreamLoad over the same lane types as TestAllStream above —
// presumably Stream/StreamLoad are not supported for 8/16-bit lanes
// (NOTE(review): confirm against TestAllStream's type list).
HWY_NOINLINE void TestAllStreamLoad() {
  const ForPartialVectors<TestStreamLoad> test;
  // No u8,u16.
  test(uint32_t());
  test(uint64_t());
  // No i8,i16.
  test(int32_t());
  test(int64_t());
  ForFloatTypes(test);
}

// Assumes little-endian byte order!
struct TestScatter {
template <class T, class D>
Expand Down Expand Up @@ -574,6 +605,7 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStreamLoad);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
Expand Down
Loading