From 7c5afa567700dce9f06734205ebb4ea1431e7cc6 Mon Sep 17 00:00:00 2001
From: Olivier Giniaux
Date: Wed, 6 Nov 2024 14:19:30 +0100
Subject: [PATCH] Experiment with simd_masked_load to read beyond without undefined behavior

---
 src/gxhash/platform/arm.rs | 18 +++++++++++-------
 src/gxhash/platform/mod.rs | 30 ++++++++++++++++--------------
 src/gxhash/platform/x86.rs | 16 ++++++++++------
 src/lib.rs                 |  1 +
 4 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/src/gxhash/platform/arm.rs b/src/gxhash/platform/arm.rs
index fc40b92..9ff8126 100644
--- a/src/gxhash/platform/arm.rs
+++ b/src/gxhash/platform/arm.rs
@@ -27,13 +27,17 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
 
 #[inline(always)]
 pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
-    // Temporary buffer filled with zeros
-    let mut buffer = [0i8; VECTOR_SIZE];
-    // Copy data into the buffer
-    core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
-    // Load the buffer into a __m256i vector
-    let partial_vector = vld1q_s8(buffer.as_ptr());
-    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
+    // // Temporary buffer filled with zeros
+    // let mut buffer = [0i8; VECTOR_SIZE];
+    // // Copy data into the buffer
+    // core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
+    // // Load the buffer into a __m256i vector
+    // let partial_vector = vld1q_s8(buffer.as_ptr());
+    // vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
+
+    let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+    let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices));
+    std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8))
 }
 
 #[inline(always)]
diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs
index f40d676..a805b59 100644
--- a/src/gxhash/platform/mod.rs
+++ b/src/gxhash/platform/mod.rs
@@ -12,27 +12,29 @@ use core::mem::size_of;
 pub(crate) const VECTOR_SIZE: usize = size_of::<State>();
 
 // 4KiB is the default page size for most systems, and conservative for other systems such as macOS ARM (16KiB)
-const PAGE_SIZE: usize = 0x1000;
+// const PAGE_SIZE: usize = 0x1000;
 
 #[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
     // Safety check
-    if check_same_page(p) {
-        get_partial_unsafe(p, len)
-    } else {
-        get_partial_safe(p, len)
-    }
-}
+    // if check_same_page(p) {
+    //     get_partial_unsafe(p, len)
+    // } else {
+    //     get_partial_safe(p, len)
+    // }
 
-#[inline(always)]
-unsafe fn check_same_page(ptr: *const State) -> bool {
-    let address = ptr as usize;
-    // Mask to keep only the last 12 bits
-    let offset_within_page = address & (PAGE_SIZE - 1);
-    // Check if the 16th byte from the current offset exceeds the page boundary
-    offset_within_page < PAGE_SIZE - VECTOR_SIZE
+    get_partial_safe(p, len)
 }
 
+// #[inline(always)]
+// unsafe fn check_same_page(ptr: *const State) -> bool {
+//     let address = ptr as usize;
+//     // Mask to keep only the last 12 bits
+//     let offset_within_page = address & (PAGE_SIZE - 1);
+//     // Check if the 16th byte from the current offset exceeds the page boundary
+//     offset_within_page < PAGE_SIZE - VECTOR_SIZE
+// }
+
 #[inline(always)]
 pub unsafe fn finalize(hash: State) -> State {
     let mut hash = aes_encrypt(hash, ld(KEYS.as_ptr()));
diff --git a/src/gxhash/platform/x86.rs b/src/gxhash/platform/x86.rs
index a5735f1..7925c3b 100644
--- a/src/gxhash/platform/x86.rs
+++ b/src/gxhash/platform/x86.rs
@@ -31,12 +31,16 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
 #[inline(always)]
 pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
-    let mut buffer = [0i8; VECTOR_SIZE];
-    // Copy data into the buffer
-    core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
-    // Load the buffer into a __m256i vector
-    let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
-    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
+    // let mut buffer = [0i8; VECTOR_SIZE];
+    // // Copy data into the buffer
+    // core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
+    // // Load the buffer into a __m256i vector
+    // let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
+    // _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
+
+    let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
+    std::intrinsics::simd::simd_masked_load(mask, data as *const i8, _mm_set1_epi8(len as i8))
 }
 
 #[inline(always)]
diff --git a/src/lib.rs b/src/lib.rs
index 705a0bf..785d2d1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+#![feature(core_intrinsics)]
 #![cfg_attr(not(feature = "std"), no_std)]
 // Hybrid SIMD width usage currently requires unstable 'stdsimd'
 #![cfg_attr(feature = "hybrid", feature(stdarch_x86_avx512))]
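
Note on the technique (editorial, not part of the patch): simd_masked_load only
dereferences the lanes whose mask bit is set, so with mask = (len > index) only
the first `len` bytes of `data` are ever read, and the masked-off lanes take
the passthrough vector, which both platform versions fill with `len`. Since the
out-of-bounds bytes are never touched, the check_same_page page-boundary test
removed above is no longer needed. A minimal scalar model of this behavior on
stable Rust (the helper name is hypothetical, for illustration only):

    /// Scalar model of the masked 16-byte load in get_partial_safe:
    /// lane i yields data[i] when i < len, and `len` otherwise.
    fn masked_load_model(data: &[i8], len: usize) -> [i8; 16] {
        // Passthrough lanes mirror vdupq_n_s8(len as i8) / _mm_set1_epi8(len as i8).
        let mut out = [len as i8; 16];
        for i in 0..len.min(16) {
            // Enabled lane: mask = (len > i), so only in-bounds bytes are read.
            out[i] = data[i];
        }
        out
    }

    fn main() {
        let data = [1i8, 2, 3]; // only 3 valid bytes; a plain 16-byte load would read past the end
        let v = masked_load_model(&data, data.len());
        assert_eq!(v[..3], [1, 2, 3]);
        assert!(v[3..].iter().all(|&b| b == 3)); // masked-off lanes hold `len`
        println!("{v:?}");
    }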