Commit 309fd66: Read partial vector first
ogxd committed Nov 17, 2023
1 parent 69b7c7c commit 309fd66
Showing 9 changed files with 140 additions and 116 deletions.
.github/workflows/rust.yml (6 changes: 4 additions & 2 deletions)

@@ -20,6 +20,8 @@ jobs:
       - name: Switch to nightly
         run: rustup default nightly
       - name: Build
-        run: cargo build --verbose
+        run: cargo build --release
       - name: Run tests
-        run: cargo test --verbose
+        run: cargo test --release
+      - name: Benchmark
+        run: cargo bench --bench throughput
Cargo.toml (4 changes: 2 additions & 2 deletions)

@@ -1,7 +1,7 @@
 [package]
 name = "gxhash"
-author = "Olivier Giniaux"
-version = "1.1.1"
+authors = ["Olivier Giniaux"]
+version = "2.0.0"
 edition = "2021"
 description = "GxHash non-cryptographic algorithm"
 license = "MIT"
benches/throughput/main.rs (40 changes: 21 additions & 19 deletions)

@@ -13,7 +13,7 @@ use rand::Rng;
 use gxhash::*;
 
 const ITERATIONS: u32 = 1000;
-const MAX_RUN_DURATION: Duration = Duration::from_millis(500);
+const MAX_RUN_DURATION: Duration = Duration::from_millis(1000);
 const FORCE_NO_INLINING: bool = false;
 
 fn main() {
@@ -31,13 +31,13 @@ fn main() {
 
     // GxHash
     let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
-    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
+    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i64| -> u64 {
         gxhash64(data, seed)
     });
 
     // XxHash (twox-hash)
-    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: i32| -> u64 {
-        twox_hash::xxh3::hash64_with_seed(data, seed as u64)
+    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: u64| -> u64 {
+        twox_hash::xxh3::hash64_with_seed(data, seed)
     });
 
     // AHash
@@ -47,13 +47,13 @@ fn main() {
     });
 
     // T1ha0
-    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: i32| -> u64 {
-        t1ha::t1ha0(data, seed as u64)
+    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: u64| -> u64 {
+        t1ha::t1ha0(data, seed)
     });
 
     // SeaHash
-    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: i32| -> u64 {
-        seahash::hash_seeded(data, seed as u64, 0, 0, 0)
+    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: u64| -> u64 {
+        seahash::hash_seeded(data, seed, 0, 0, 0)
     });
 
     // MetroHash
@@ -70,8 +70,8 @@ fn main() {
     });
 
     // FNV-1a
-    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: i32| -> u64 {
-        let mut fnv_hasher = fnv::FnvHasher::with_key(seed as u64);
+    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: u64| -> u64 {
+        let mut fnv_hasher = fnv::FnvHasher::with_key(seed);
         fnv_hasher.write(data);
         fnv_hasher.finish()
     });
@@ -80,8 +80,8 @@ fn main() {
     unsafe { dealloc(ptr, layout) };
 }
 
-fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
-    where F: Fn(&[u8], i32) -> u64
+fn benchmark<F, S>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
+    where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize>
 {
     processor.on_start(name);
     for i in 2.. {
@@ -91,21 +91,23 @@ fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delega
         }
 
         // Warmup
-        black_box(time(ITERATIONS, &|| delegate(&data[..len], 0)));
+        black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default())));
 
         let mut total_duration: Duration = Duration::ZERO;
        let mut runs: usize = 0;
        let now = Instant::now();
        while now.elapsed() < MAX_RUN_DURATION {
-            // Prevent optimizations from predictable seed
-            let seed = total_duration.as_nanos() as i32;
-            // Prevent optimizations from predictable slice
-            // Also makes the benchmark use both aligned on unaligned data
-            let start = seed as usize & 0xFF;
+            // Make seed unpredictable to prevent optimizations
+            let seed = S::try_from(total_duration.as_nanos())
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!"));
+            // Offset slice by an unpredictable amount to prevent optimization (pre caching)
+            // and make the benchmark use both aligned and unaligned data
+            let start = S::try_into(seed)
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF;
            let end = start + len;
            let slice = &data[start..end];
            // Execute method for a new iterations
-            total_duration += time(ITERATIONS, &|| delegate(slice, seed));
+            total_duration += time(ITERATIONS, &|| delegate(slice, S::default()));
            runs += 1;
        }
        let throughput = (len as f64) / (1024f64 * 1024f64 * (total_duration.as_secs_f64() / runs as f64 / ITERATIONS as f64));
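The harness is now generic over the seed type `S`, so each hash receives a seed in its native width (i64 for gxhash, u64 for the others) with no lossy casts at the call site. A condensed sketch of the same pattern, with `run_bench` as a hypothetical stand-in for the harness above (the 100 ms budget is arbitrary, and warmup plus `ResultProcessor` bookkeeping are omitted):

```rust
use std::hint::black_box;
use std::time::{Duration, Instant};

// `data` must be at least `len + 255` bytes, since the slice start is
// offset by up to 0xFF below.
fn run_bench<F, S>(name: &str, data: &[u8], len: usize, hash: F)
where
    F: Fn(&[u8], S) -> u64,
    S: Default + TryFrom<u128> + TryInto<usize>,
{
    let mut total = Duration::ZERO;
    let mut runs = 0usize;
    let clock = Instant::now();
    while clock.elapsed() < Duration::from_millis(100) {
        // A value derived from elapsed time is unpredictable to the
        // optimizer, so nothing here can be constant-folded...
        let seed = S::try_from(total.as_nanos())
            .unwrap_or_else(|_| panic!("seed does not fit in S"));
        // ...and it doubles as a 0..=255 slice offset, exercising both
        // aligned and unaligned starting addresses.
        let start: usize = seed
            .try_into()
            .unwrap_or_else(|_| panic!("offset does not fit in usize"))
            & 0xFF;
        let slice = &data[start..start + len];
        let begin = Instant::now();
        // The unpredictability lives in the offset; the hash itself gets a
        // default seed, mirroring the harness above.
        black_box(hash(slice, S::default()));
        total += begin.elapsed();
        runs += 1;
    }
    let mib_s = (len * runs) as f64 / (1024.0 * 1024.0 * total.as_secs_f64());
    println!("{name}: {mib_s:.0} MiB/s");
}
```

A closure such as `|data, seed: u64| t1ha::t1ha0(data, seed)` then selects its own `S` through inference, which is exactly how the per-algorithm closures in the diff work.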
src/gxhash/mod.rs (85 changes: 46 additions & 39 deletions)
@@ -1,5 +1,7 @@
 pub(crate) mod platform;
 
+use std::intrinsics::likely;
+
 use platform::*;
 
 /// Hashes an arbitrary stream of bytes to an u32.
@@ -12,7 +14,7 @@ use platform::*;
 /// println!("Hash is {:x}!", gxhash::gxhash32(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
+pub fn gxhash32(input: &[u8], seed: i64) -> u32 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u32;
         *p
@@ -29,7 +31,7 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
 /// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
+pub fn gxhash64(input: &[u8], seed: i64) -> u64 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u64;
         *p
@@ -46,7 +48,7 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
 /// println!("Hash is {:x}!", gxhash::gxhash128(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash128(input: &[u8], seed: i32) -> u128 {
+pub fn gxhash128(input: &[u8], seed: i64) -> u128 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u128;
         *p
@@ -58,18 +60,12 @@ macro_rules! load_unaligned {
         $(
             #[allow(unused_mut)]
             let mut $var = load_unaligned($ptr);
+            #[allow(unused_assignments)]
            $ptr = ($ptr).offset(1);
        )+
    };
 }
 
-const RANGE_1_BEGIN: usize = VECTOR_SIZE + 1;
-const RANGE_1_END: usize = VECTOR_SIZE * 2;
-const RANGE_2_BEGIN: usize = RANGE_1_BEGIN + 1;
-const RANGE_2_END: usize = VECTOR_SIZE * 3;
-const RANGE_3_BEGIN: usize = RANGE_2_BEGIN + 1;
-const RANGE_3_END: usize = VECTOR_SIZE * 4;
-
 #[inline(always)]
 pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
     finalize(compress_all(input), seed)
@@ -78,40 +74,51 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
 #[inline(always)]
 unsafe fn compress_all(input: &[u8]) -> State {
 
-    let len: usize = input.len();
+    let len = input.len();
     let mut ptr = input.as_ptr() as *const State;
 
-    let (mut hash_vector, remaining_bytes, p) = match len {
-        // Fast path with no compression for payloads that fit in a single state
-        0..=VECTOR_SIZE => {
-            (get_partial(ptr, len), 0, ptr)
-        },
-        RANGE_1_BEGIN..=RANGE_1_END => {
-            load_unaligned!(ptr, v1);
-            (v1, len - VECTOR_SIZE, ptr)
-        },
-        RANGE_2_BEGIN..=RANGE_2_END => {
-            load_unaligned!(ptr, v1, v2);
-            (compress(v1, v2), len - VECTOR_SIZE * 2, ptr)
-        },
-        RANGE_3_BEGIN..=RANGE_3_END => {
-            load_unaligned!(ptr, v1, v2, v3);
-            (compress(compress(v1, v2), v3), len - VECTOR_SIZE * 3, ptr)
-        },
-        _ => {
-            compress_many(ptr, create_empty(), len)
-        }
-    };
+    if likely(len <= VECTOR_SIZE) {
+        // Input fits on a single SIMD vector, however we might read beyond the input message
+        // Thus we need this safe method that checks if it can safely read beyond or must copy
+        return get_partial(ptr, len);
+    }
+
+    let remaining_bytes = len % VECTOR_SIZE;
 
-    if remaining_bytes > 0 {
-        hash_vector = compress(hash_vector, get_partial(p, remaining_bytes))
+    // The input does not fit on a single SIMD vector
+    let hash_vector: State;
+    if remaining_bytes == 0 {
+        load_unaligned!(ptr, v0);
+        hash_vector = v0;
+    } else {
+        // If the input length does not match the length of a whole number of SIMD vectors,
+        // it means we'll need to read a partial vector. We can start with the partial vector first,
+        // so that we can safely read beyond since we expect the following bytes to still be part of
+        // the input
+        hash_vector = get_partial_unsafe(ptr, remaining_bytes as usize);
+        ptr = ptr.byte_add(remaining_bytes);
     }
 
-    hash_vector
+    if len <= VECTOR_SIZE * 2 {
+        // Fast path when input length > 16 and <= 32
+        load_unaligned!(ptr, v0);
+        compress(hash_vector, v0)
+    } else if len <= VECTOR_SIZE * 3 {
+        // Fast path when input length > 32 and <= 48
+        load_unaligned!(ptr, v0, v1);
+        compress(hash_vector, compress(v0, v1))
+    } else if len <= VECTOR_SIZE * 4 {
+        // Fast path when input length > 48 and <= 64
+        load_unaligned!(ptr, v0, v1, v2);
+        compress(hash_vector, compress(compress(v0, v1), v2))
+    } else {
+        // Input message is large and we can use the high ILP loop
+        compress_many(ptr, hash_vector, len)
+    }
 }
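The heart of the commit is the reordering above: when the input does not divide evenly into SIMD vectors, the leftover partial vector is consumed first, from the front of the buffer, where over-reading is guaranteed to stay inside the input. Every subsequent read is then a full, in-bounds vector load. A scalar sketch of the same control flow, with 8-byte u64 words standing in for 16-byte SIMD vectors and a hypothetical `mix` function standing in for `compress`:

```rust
const BLOCK: usize = 8;

// Hypothetical combiner; gxhash's real `compress` is an AES-based SIMD mix.
fn mix(a: u64, b: u64) -> u64 {
    (a.rotate_left(23) ^ b).wrapping_mul(0x9E37_79B9_7F4A_7C15)
}

fn hash_blocks(input: &[u8]) -> u64 {
    let len = input.len();
    if len <= BLOCK {
        // Short input: a bounds-checked copy into a zeroed buffer is unavoidable.
        let mut buf = [0u8; BLOCK];
        buf[..len].copy_from_slice(input);
        return u64::from_le_bytes(buf) ^ len as u64;
    }
    let remainder = len % BLOCK;
    let (mut state, mut offset) = if remainder == 0 {
        (u64::from_le_bytes(input[..BLOCK].try_into().unwrap()), BLOCK)
    } else {
        // Read the partial block FIRST, as a full-width over-read from the
        // start: the extra bytes still belong to the input, since len > BLOCK.
        let word = u64::from_le_bytes(input[..BLOCK].try_into().unwrap());
        // Mask away the bytes that are not part of the partial block.
        let mask = u64::MAX >> (8 * (BLOCK - remainder));
        ((word & mask) ^ remainder as u64, remainder)
    };
    // From here on, every read is a whole block and always in bounds.
    while offset < len {
        state = mix(state, u64::from_le_bytes(input[offset..offset + BLOCK].try_into().unwrap()));
        offset += BLOCK;
    }
    state
}
```

Only the `len <= BLOCK` path ever needs the copy-or-mask safety dance; all larger inputs reduce to whole-vector loads, which is what the rewritten `compress_all` exploits.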

 #[inline(always)]
-unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> (State, usize, *const State) {
+unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State {
 
     const UNROLL_FACTOR: usize = 8;
 
@@ -143,8 +150,7 @@ unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_byt
         hash_vector = compress(hash_vector, v0);
     }
 
-    let remaining_bytes: usize = remaining_bytes & (VECTOR_SIZE - 1);
-    (hash_vector, remaining_bytes, ptr)
+    hash_vector
 }
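`compress_many`, whose body is only partially visible in this diff, now takes the pre-seeded `hash_vector` and returns the final state directly, processing the bulk of the input in groups of `UNROLL_FACTOR = 8` so that several independent loads are in flight at once. A hedged scalar sketch of that unrolling idea (the real loop works on SIMD states and unrolls by 8; this illustration unrolls by 4 and reuses the hypothetical `mix`):

```rust
fn mix(a: u64, b: u64) -> u64 {
    (a.rotate_left(23) ^ b).wrapping_mul(0x9E37_79B9_7F4A_7C15)
}

fn compress_bulk(words: &[u64], mut state: u64) -> u64 {
    let mut chunks = words.chunks_exact(4);
    for group in &mut chunks {
        // The four loads carry no dependency on each other, so the CPU can
        // issue them in parallel; only the final fold touches `state`.
        let folded = mix(mix(group[0], group[1]), mix(group[2], group[3]));
        state = mix(state, folded);
    }
    // Whole words that did not fill a complete group.
    for &word in chunks.remainder() {
        state = mix(state, word);
    }
    state
}
```

Folding the group into one value before touching `state` keeps the serial dependency chain short, which is the instruction-level parallelism the comment "high ILP loop" refers to.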

 #[cfg(test)]
@@ -269,6 +275,7 @@ mod tests {
     fn is_stable() {
         assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
         assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
+        assert_eq!(3128839713, gxhash32(&[42u8; 1000], 1234));
         assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0));
         assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42));
     }
 }
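At the API surface, the visible change is the seed widening from i32 to i64, which is presumably what motivates the major version bump to 2.0.0 in Cargo.toml. A minimal call-site sketch; the expected value comes from the new `is_stable` test vector above:

```rust
fn main() {
    let bytes = [42u8; 1000];
    let seed: i64 = 1234; // was i32 before this commit

    // All three variants now take an i64 seed.
    println!("{:x}", gxhash::gxhash64(&bytes, seed));
    assert_eq!(3128839713u32, gxhash::gxhash32(&bytes, seed));
}
```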
src/gxhash/platform/arm_128.rs (29 changes: 16 additions & 13 deletions)
@@ -20,8 +20,8 @@ pub unsafe fn create_empty() -> State {
 }
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    vreinterpretq_s8_s32(vdupq_n_s32(seed))
+pub unsafe fn create_seed(seed: i64) -> State {
+    vreinterpretq_s8_s64(vdupq_n_s64(seed))
 }
 
 #[inline(always)]
@@ -31,27 +31,30 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
 }
 
 #[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     if likely(check_same_page(p)) {
-        // Unsafe (hence the check) but much faster
-        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
-        let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
-        partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const i8, len as usize);
+        get_partial_safe(p, len)
     }
-    // Prevents padded zeroes to introduce bias
-    return vaddq_s8(partial_vector, vdupq_n_s8(len as i8));
 }
 
 #[inline(never)]
-unsafe fn get_partial_safe(data: *const i8, len: usize) -> State {
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
     let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into a __m256i vector
-    vld1q_s8(buffer.as_ptr())
+    let partial_vector = vld1q_s8(buffer.as_ptr());
+    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
 }
 
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+    let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
+    let partial_vector = vandq_s8(load_unaligned(data), ReinterpretUnion { uint8: mask }.int8);
+    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
+}
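`get_partial` only takes the masked over-read fast path when `check_same_page`, which is not shown in this diff, confirms that a full 16-byte load starting at `p` cannot cross into the next memory page. A hedged sketch of what such a check looks like, assuming 4 KiB pages; the repository's actual implementation may differ in detail:

```rust
const VECTOR_SIZE: usize = 16; // size of a 128-bit NEON vector, as in the diff
const PAGE_SIZE: usize = 0x1000; // assumed 4 KiB pages

#[inline(always)]
fn check_same_page(ptr: *const i8) -> bool {
    let address = ptr as usize;
    // Position of the pointer within its page.
    let offset_within_page = address & (PAGE_SIZE - 1);
    // The 16-byte read stays inside the page only if at least
    // VECTOR_SIZE bytes remain before the page boundary.
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE
}
```

If the whole window lies within one mapped page, reading past the end of the slice cannot fault, so the fast branch may use the masked vector load even though it technically reads out of bounds; otherwise the copying `get_partial_safe` fallback is used.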
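Both partial-load paths now end with `vaddq_s8(partial_vector, vdupq_n_s8(len as i8))`, matching the old comment "Prevents padded zeroes to introduce bias": without it, all-zero inputs of different lengths would collapse to the same zero-padded vector. A scalar illustration of the idea (assuming `input.len() <= 16`, as the callers guarantee):

```rust
fn partial_lanes(input: &[u8]) -> [i8; 16] {
    assert!(input.len() <= 16);
    let mut lanes = [0i8; 16];
    for (lane, &byte) in lanes.iter_mut().zip(input) {
        *lane = byte as i8;
    }
    // Without this addition, [0u8; 1] and [0u8; 2] would map to the
    // same zero-padded vector.
    for lane in &mut lanes {
        *lane = lane.wrapping_add(input.len() as i8);
    }
    lanes
}
```

With the length folded in, `partial_lanes(&[0u8; 1])` and `partial_lanes(&[0u8; 2])` differ in every lane, so equal-content inputs of different lengths hash differently.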
