Read partial vector first
ogxd committed Nov 17, 2023
1 parent 69b7c7c commit 791cfbd
Showing 10 changed files with 161 additions and 137 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/rust.yml
@@ -20,6 +20,8 @@ jobs:
- name: Switch to nightly
run: rustup default nightly
- name: Build
run: cargo build --verbose
run: cargo build --release
- name: Run tests
run: cargo test --verbose
run: cargo test --release
- name: Benchmark
run: cargo bench --bench throughput
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "gxhash"
author = "Olivier Giniaux"
version = "1.1.1"
authors = ["Olivier Giniaux"]
version = "2.0.0"
edition = "2021"
description = "GxHash non-cryptographic algorithm"
license = "MIT"
42 changes: 21 additions & 21 deletions README.md
@@ -46,30 +46,30 @@ To run the benchmarks: `cargo bench --bench throughput`.

### Intel Ice Lake (x86 64-bit) (GCP n2-standard-2)

| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|-------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
| gxhash-avx2 | 4021.94 | 16113.58 | 42936.69 | 72145.2 | 94127.12 | 98261.24 | 100333.4 |
| gxhash | 6122.63 | 24476.94 | 25591.9 | 51949.28 | 61253.58 | 64774.75 | 65708.38 |
| xxhash | 915.69 | 4266.94 | 10339.13 | 10116.71 | 17164.93 | 20135.65 | 22834.07 |
| ahash | 1838.59 | 8712.95 | 22473.84 | 25958.66 | 35090.25 | 38440.04 | 39308.7 |
| t1ha0 | 740.15 | 2707.93 | 8572.39 | 28659.06 | 51202.34 | 59918.76 | 65902.36 |
| seahash | 213.04 | 620.54 | 1762.72 | 2473.87 | 2761.71 | 2837.24 | 2860.51 |
| metrohash | 754.55 | 2556.83 | 5983.26 | 10395.86 | 12738.02 | 13492.63 | 13624.54 |
| highwayhash | 122.52 | 490.89 | 3278.71 | 7057.25 | 9726.72 | 10743.01 | 11036.79 |
| fnv-1a | 1169.76 | 3062.36 | 1602.71 | 933.96 | 833.82 | 811.77 | 808.07 |
| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|-------------|-----:|------:|------:|------:|------:|-------:|-------:|
| gxhash-avx2 | 4189 | 16734 | 46142 | 72679 | 96109 | 102202 | 100845 |
| gxhash | 6069 | 24283 | 29465 | 49542 | 58164 | 62511 | 64281 |
| xxhash | 915 | 4266 | 10339 | 10116 | 17164 | 20135 | 22834 |
| ahash | 1838 | 8712 | 22473 | 25958 | 35090 | 38440 | 39308 |
| t1ha0 | 740 | 2707 | 8572 | 28659 | 51202 | 59918 | 65902 |
| seahash | 213 | 620 | 1762 | 2473 | 2761 | 2837 | 2860 |
| metrohash | 754 | 2556 | 5983 | 10395 | 12738 | 13492 | 13624 |
| highwayhash | 122 | 490 | 3278 | 7057 | 9726 | 10743 | 11036 |
| fnv-1a | 1169 | 3062 | 1602 | 933 | 833 | 811 | 808 |

### Macbook M1 Pro (ARM 64-bit)

| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|--------------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
| gxhash | 5441.06 | 21635.99 | 26282.95 | 59859.19 | 70175.71 | 74723.96 | 75020.74 |
| xxhash | 1407.55 | 5638.49 | 11432.47 | 8380.32 | 16289.65 | 18690.69 | 19310.57 |
| ahash | 1471.71 | 5920.45 | 15597.47 | 22280.2 | 28672.62 | 29631 | 31174.07 |
| t1ha0 | 1181.94 | 4254.77 | 10277.71 | 15459.97 | 14120.73 | 13741.89 | 13743.4 |
| seahash | 1130 | 4428.8 | 8756.7 | 9248.1 | 8357.73 | 8085.24 | 8056.4 |
| metrohash | 1094.4 | 3389.34 | 9709.14 | 14431.34 | 17470 | 17679.48 | 17931.1 |
| highwayhash | 182.95 | 743.38 | 2696.71 | 5196.88 | 6573.42 | 7061.91 | 7170.97 |
| fnv-1a | 1988.88 | 2627.51 | 1407.3 | 896.08 | 777.74 | 753.23 | 745.68 |
| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|-------------|-----:|------:|------:|------:|------:|------:|-------:|
| gxhash | 6192 | 24901 | 31770 | 59465 | 72476 | 74723 | 76746 |
| xxhash | 1407 | 5638 | 11432 | 8380 | 16289 | 18690 | 19310 |
| ahash | 1471 | 5920 | 15597 | 22280 | 28672 | 29631 | 31174 |
| t1ha0 | 1181 | 4254 | 10277 | 15459 | 14120 | 13741 | 13743 |
| seahash | 1130 | 4428 | 8756 | 9248 | 8357 | 8085 | 8056 |
| metrohash | 1094 | 3389 | 9709 | 14431 | 17470 | 17679 | 17931 |
| highwayhash | 182 | 743 | 2696 | 5196 | 6573 | 7061 | 7170 |
| fnv-1a | 1988 | 2627 | 1407 | 896 | 777 | 753 | 745 |

## Debugging
The algorithm is mostly inlined, so most profilers fail to attribute time to anything useful. The best I could achieve is profiling at the assembly level. [cargo-asm](https://github.com/gnzlbg/cargo-asm) is an easy way to view the actual generated assembly code (`cargo asm gxhash::gxhash::gxhash`). [AMD μProf](https://www.amd.com/en/developer/uprof.html) gives some useful insights on time spent per instruction.
40 changes: 21 additions & 19 deletions benches/throughput/main.rs
@@ -13,7 +13,7 @@ use rand::Rng;
use gxhash::*;

const ITERATIONS: u32 = 1000;
const MAX_RUN_DURATION: Duration = Duration::from_millis(500);
const MAX_RUN_DURATION: Duration = Duration::from_millis(1000);
const FORCE_NO_INLINING: bool = false;

fn main() {
Expand All @@ -31,13 +31,13 @@ fn main() {

// GxHash
let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i64| -> u64 {
gxhash64(data, seed)
});

// XxHash (twox-hash)
benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: i32| -> u64 {
twox_hash::xxh3::hash64_with_seed(data, seed as u64)
benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: u64| -> u64 {
twox_hash::xxh3::hash64_with_seed(data, seed)
});

// AHash
Expand All @@ -47,13 +47,13 @@ fn main() {
});

// T1ha0
benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: i32| -> u64 {
t1ha::t1ha0(data, seed as u64)
benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: u64| -> u64 {
t1ha::t1ha0(data, seed)
});

// SeaHash
benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: i32| -> u64 {
seahash::hash_seeded(data, seed as u64, 0, 0, 0)
benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: u64| -> u64 {
seahash::hash_seeded(data, seed, 0, 0, 0)
});

// MetroHash
Expand All @@ -70,8 +70,8 @@ fn main() {
});

// FNV-1a
benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: i32| -> u64 {
let mut fnv_hasher = fnv::FnvHasher::with_key(seed as u64);
benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: u64| -> u64 {
let mut fnv_hasher = fnv::FnvHasher::with_key(seed);
fnv_hasher.write(data);
fnv_hasher.finish()
});
Expand All @@ -80,8 +80,8 @@ fn main() {
unsafe { dealloc(ptr, layout) };
}

fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
where F: Fn(&[u8], i32) -> u64
fn benchmark<F, S>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize>
{
processor.on_start(name);
for i in 2.. {
Expand All @@ -91,21 +91,23 @@ fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delega
}

// Warmup
black_box(time(ITERATIONS, &|| delegate(&data[..len], 0)));
black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default())));

let mut total_duration: Duration = Duration::ZERO;
let mut runs: usize = 0;
let now = Instant::now();
while now.elapsed() < MAX_RUN_DURATION {
// Prevent optimizations from predictable seed
let seed = total_duration.as_nanos() as i32;
// Prevent optimizations from predictable slice
// Also makes the benchmark use both aligned on unaligned data
let start = seed as usize & 0xFF;
// Make seed unpredictable to prevent optimizations
let seed = S::try_from(total_duration.as_nanos())
.unwrap_or_else(|_| panic!("Something went horribly wrong!"));
// Offset slice by an unpredictable amount to prevent optimization (pre caching)
// and make the benchmark use both aligned and unaligned data
let start = S::try_into(seed)
.unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF;
let end = start + len;
let slice = &data[start..end];
// Execute the method for another batch of iterations
total_duration += time(ITERATIONS, &|| delegate(slice, seed));
total_duration += time(ITERATIONS, &|| delegate(slice, S::default()));
runs += 1;
}
let throughput = (len as f64) / (1024f64 * 1024f64 * (total_duration.as_secs_f64() / runs as f64 / ITERATIONS as f64));
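
The reworked `benchmark` function is now generic over the seed type, so every hasher is driven with its native seed width (i64 for gxhash, u64 for xxhash, t1ha0, seahash and fnv-1a). Below is a minimal, self-contained sketch of that pattern; `dummy_hash`, `measure` and the 10 ms time budget are illustrative assumptions, not the benchmark's actual code.

```rust
use std::hint::black_box;
use std::time::{Duration, Instant};

// Placeholder hash; stands in for gxhash64, xxhash, t1ha0, etc.
fn dummy_hash(data: &[u8], seed: u64) -> u64 {
    data.iter().fold(seed, |acc, &b| acc.rotate_left(5) ^ b as u64)
}

// Sketch of the generic measurement loop: S is the hasher's native seed type.
fn measure<F, S>(data: &[u8], len: usize, delegate: F) -> Duration
where
    F: Fn(&[u8], S) -> u64,
    S: Default + TryFrom<u128> + TryInto<usize>,
{
    let mut total = Duration::ZERO;
    let clock = Instant::now();
    while clock.elapsed() < Duration::from_millis(10) {
        // The accumulated duration is unknowable at compile time, so the seed
        // (and the slice offset derived from it) cannot be constant-folded.
        let seed = S::try_from(total.as_nanos()).unwrap_or_else(|_| S::default());
        let start = seed.try_into().unwrap_or(0) & 0xFF;
        let slice = &data[start..start + len];
        let begin = Instant::now();
        black_box(delegate(slice, S::default()));
        total += begin.elapsed();
    }
    total
}

fn main() {
    let data = vec![42u8; 1024 + 0xFF];
    let elapsed = measure(&data, 64, |d: &[u8], s: u64| dummy_hash(d, s));
    println!("64-byte keys hashed for {:?}", elapsed);
}
```

Deriving the seed and the slice offset from the accumulated duration keeps them opaque to the optimizer, and the varying offset exercises both aligned and unaligned inputs.
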
85 changes: 46 additions & 39 deletions src/gxhash/mod.rs
@@ -1,5 +1,7 @@
pub(crate) mod platform;

use std::intrinsics::likely;

use platform::*;

/// Hashes an arbitrary stream of bytes to an u32.
Expand All @@ -12,7 +14,7 @@ use platform::*;
/// println!("Hash is {:x}!", gxhash::gxhash32(&bytes, seed));
/// ```
#[inline(always)]
pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
pub fn gxhash32(input: &[u8], seed: i64) -> u32 {
unsafe {
let p = &gxhash(input, create_seed(seed)) as *const State as *const u32;
*p
Expand All @@ -29,7 +31,7 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
/// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed));
/// ```
#[inline(always)]
pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
pub fn gxhash64(input: &[u8], seed: i64) -> u64 {
unsafe {
let p = &gxhash(input, create_seed(seed)) as *const State as *const u64;
*p
Expand All @@ -46,7 +48,7 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
/// println!("Hash is {:x}!", gxhash::gxhash128(&bytes, seed));
/// ```
#[inline(always)]
pub fn gxhash128(input: &[u8], seed: i32) -> u128 {
pub fn gxhash128(input: &[u8], seed: i64) -> u128 {
unsafe {
let p = &gxhash(input, create_seed(seed)) as *const State as *const u128;
*p
Expand All @@ -58,18 +60,12 @@ macro_rules! load_unaligned {
$(
#[allow(unused_mut)]
let mut $var = load_unaligned($ptr);
#[allow(unused_assignments)]
$ptr = ($ptr).offset(1);
)+
};
}

const RANGE_1_BEGIN: usize = VECTOR_SIZE + 1;
const RANGE_1_END: usize = VECTOR_SIZE * 2;
const RANGE_2_BEGIN: usize = RANGE_1_BEGIN + 1;
const RANGE_2_END: usize = VECTOR_SIZE * 3;
const RANGE_3_BEGIN: usize = RANGE_2_BEGIN + 1;
const RANGE_3_END: usize = VECTOR_SIZE * 4;

#[inline(always)]
pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
finalize(compress_all(input), seed)
Expand All @@ -78,40 +74,51 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
#[inline(always)]
unsafe fn compress_all(input: &[u8]) -> State {

let len: usize = input.len();
let len = input.len();
let mut ptr = input.as_ptr() as *const State;

let (mut hash_vector, remaining_bytes, p) = match len {
// Fast path with no compression for payloads that fit in a single state
0..=VECTOR_SIZE => {
(get_partial(ptr, len), 0, ptr)
},
RANGE_1_BEGIN..=RANGE_1_END => {
load_unaligned!(ptr, v1);
(v1, len - VECTOR_SIZE, ptr)
},
RANGE_2_BEGIN..=RANGE_2_END => {
load_unaligned!(ptr, v1, v2);
(compress(v1, v2), len - VECTOR_SIZE * 2, ptr)
},
RANGE_3_BEGIN..=RANGE_3_END => {
load_unaligned!(ptr, v1, v2, v3);
(compress(compress(v1, v2), v3), len - VECTOR_SIZE * 3, ptr)
},
_ => {
compress_many(ptr, create_empty(), len)
}
};
if likely(len <= VECTOR_SIZE) {
// Input fits on a single SIMD vector, however we might read beyond the input message
// Thus we need this safe method that checks if it can safely read beyond or must copy
return get_partial(ptr, len);
}

let remaining_bytes = len % VECTOR_SIZE;

if remaining_bytes > 0 {
hash_vector = compress(hash_vector, get_partial(p, remaining_bytes))
// The input does not fit on a single SIMD vector
let hash_vector: State;
if remaining_bytes == 0 {
load_unaligned!(ptr, v0);
hash_vector = v0;
} else {
// If the input length does not match the length of a whole number of SIMD vectors,
// it means we'll need to read a partial vector. We can start with the partial vector first,
// so that we can safely read beyond since we expect the following bytes to still be part of
// the input
hash_vector = get_partial_unsafe(ptr, remaining_bytes as usize);
ptr = ptr.byte_add(remaining_bytes);
}

hash_vector
if len <= VECTOR_SIZE * 2 {
// Fast path when input length > 16 and <= 32
load_unaligned!(ptr, v0);
compress(hash_vector, v0)
} else if len <= VECTOR_SIZE * 3 {
// Fast path when input length > 32 and <= 48
load_unaligned!(ptr, v0, v1);
compress(hash_vector, compress(v0, v1))
} else if len <= VECTOR_SIZE * 4 {
// Fast path when input length > 48 and <= 64
load_unaligned!(ptr, v0, v1, v2);
compress(hash_vector, compress(compress(v0, v1), v2))
} else {
// Input message is large and we can use the high ILP loop
compress_many(ptr, hash_vector, len)
}
}

#[inline(always)]
unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> (State, usize, *const State) {
unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State {

const UNROLL_FACTOR: usize = 8;

@@ -143,8 +150,7 @@ unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_byt
hash_vector = compress(hash_vector, v0);
}

let remaining_bytes: usize = remaining_bytes & (VECTOR_SIZE - 1);
(hash_vector, remaining_bytes, ptr)
hash_vector
}

#[cfg(test)]
@@ -269,6 +275,7 @@ mod tests {
fn is_stable() {
assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
assert_eq!(3128839713, gxhash32(&[42u8; 1000], 1234));
assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0));
assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42));
}
}
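
The heart of this commit is the new control flow in `compress_all`: when the input length is not a whole number of SIMD vectors (16 bytes on the 128-bit paths), the partial vector is read first, so every later read is a full-width load that stays inside the message. Below is a portable scalar sketch of that dispatch; the `Block` type, the XOR-based `compress` and the single folding loop are illustrative stand-ins, not the actual SIMD implementation, and the dedicated 2-, 3- and 4-vector fast paths are collapsed into one loop for brevity.

```rust
// Portable, scalar illustration of the new dispatch in compress_all.
const VECTOR_SIZE: usize = 16;
type Block = [u8; VECTOR_SIZE];

// Stand-in for the SIMD compress: fold two blocks into one.
fn compress(a: Block, b: Block) -> Block {
    let mut out = [0u8; VECTOR_SIZE];
    for i in 0..VECTOR_SIZE {
        out[i] = a[i] ^ b[i].rotate_left(1);
    }
    out
}

// Full-width read of one block at `offset`.
fn load(input: &[u8], offset: usize) -> Block {
    let mut b = [0u8; VECTOR_SIZE];
    b.copy_from_slice(&input[offset..offset + VECTOR_SIZE]);
    b
}

// Partial read; the real get_partial also folds the length in so that
// the padding does not bias the hash.
fn get_partial(input: &[u8], offset: usize, len: usize) -> Block {
    let mut b = [len as u8; VECTOR_SIZE];
    for i in 0..len {
        b[i] ^= input[offset + i];
    }
    b
}

fn compress_all(input: &[u8]) -> Block {
    let len = input.len();
    if len <= VECTOR_SIZE {
        // Fast path: the whole input fits in one (possibly partial) block.
        return get_partial(input, 0, len);
    }
    let remaining = len % VECTOR_SIZE;
    // Read the partial block first: since more input follows it, every
    // subsequent full-width read stays inside the message.
    let (mut hash, mut offset) = if remaining == 0 {
        (load(input, 0), VECTOR_SIZE)
    } else {
        (get_partial(input, 0, remaining), remaining)
    };
    while offset < len {
        hash = compress(hash, load(input, offset));
        offset += VECTOR_SIZE;
    }
    hash
}

fn main() {
    println!("{:?}", compress_all(&[42u8; 42]));
}
```

Reading the partial block first is what makes the unchecked full-width reads that follow safe: the bytes beyond the partial region are still part of the input.
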
29 changes: 16 additions & 13 deletions src/gxhash/platform/arm_128.rs
@@ -20,8 +20,8 @@ pub unsafe fn create_empty() -> State {
}

#[inline(always)]
pub unsafe fn create_seed(seed: i32) -> State {
vreinterpretq_s8_s32(vdupq_n_s32(seed))
pub unsafe fn create_seed(seed: i64) -> State {
vreinterpretq_s8_s64(vdupq_n_s64(seed))
}

#[inline(always)]
Expand All @@ -31,27 +31,30 @@ pub unsafe fn load_unaligned(p: *const State) -> State {

#[inline(always)]
pub unsafe fn get_partial(p: *const State, len: usize) -> State {
let partial_vector: State;
if likely(check_same_page(p)) {
// Unsafe (hence the check) but much faster
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8);
get_partial_unsafe(p, len)
} else {
partial_vector = get_partial_safe(p as *const i8, len as usize);
get_partial_safe(p, len)
}
// Prevents padded zeroes to introduce bias
return vaddq_s8(partial_vector, vdupq_n_s8(len as i8));
}

#[inline(never)]
unsafe fn get_partial_safe(data: *const i8, len: usize) -> State {
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
let mut buffer = [0i8; VECTOR_SIZE];
// Copy data into the buffer
std::ptr::copy(data, buffer.as_mut_ptr(), len);
std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
// Load the buffer into a NEON vector
vld1q_s8(buffer.as_ptr())
let partial_vector = vld1q_s8(buffer.as_ptr());
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
let partial_vector = vandq_s8(load_unaligned(data), ReinterpretUnion { uint8: mask }.int8);
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
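
`get_partial` now dispatches between a fast `get_partial_unsafe` path, taken when `check_same_page` guarantees that a full 16-byte load starting at `p` cannot cross into the next page, and a copying `get_partial_safe` fallback. `check_same_page` itself does not appear in this diff; the sketch below shows how such a check can work, assuming 4 KiB pages, and may differ from the crate's actual helper.

```rust
// Sketch of the idea behind check_same_page, assuming 4 KiB pages.
const PAGE_SIZE: usize = 0x1000;
const VECTOR_SIZE: usize = 16;

#[inline(always)]
fn check_same_page(ptr: *const u8) -> bool {
    let offset_in_page = ptr as usize & (PAGE_SIZE - 1);
    // A full 16-byte load starting at `ptr` cannot fault if it does not cross
    // into the next page: memory protection is per page, and the current page
    // is readable because `ptr` itself points into the input.
    offset_in_page <= PAGE_SIZE - VECTOR_SIZE
}

fn main() {
    let buf = [0u8; 64];
    println!("{}", check_same_page(buf.as_ptr()));
}
```

The check is just a mask and a comparison, which is why taking the unchecked load path pays off for short keys.
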