From 791cfbd619926386cc955dbf59f536283f3f5386 Mon Sep 17 00:00:00 2001
From: Olivier Giniaux
Date: Thu, 16 Nov 2023 17:11:56 +0100
Subject: [PATCH] Read partial vector first

---
 .github/workflows/rust.yml     |  6 ++-
 Cargo.toml                     |  4 +-
 README.md                      | 42 ++++++++---------
 benches/throughput/main.rs     | 40 ++++++++--------
 src/gxhash/mod.rs              | 85 ++++++++++++++++++----------------
 src/gxhash/platform/arm_128.rs | 29 ++++++------
 src/gxhash/platform/x86_128.rs | 44 ++++++++++--------
 src/gxhash/platform/x86_256.rs | 44 ++++++++++--------
 src/hasher.rs                  |  2 +-
 src/lib.rs                     |  2 +
 10 files changed, 161 insertions(+), 137 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 2bcc45a..a21302c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -20,6 +20,8 @@ jobs:
     - name: Switch to nightly
       run: rustup default nightly
     - name: Build
-      run: cargo build --verbose
+      run: cargo build --release
     - name: Run tests
-      run: cargo test --verbose
\ No newline at end of file
+      run: cargo test --release
+    - name: Benchmark
+      run: cargo bench --bench throughput
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index c2115a4..ebad45c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "gxhash"
-author = "Olivier Giniaux"
-version = "1.1.1"
+authors = ["Olivier Giniaux"]
+version = "2.0.0"
 edition = "2021"
 description = "GxHash non-cryptographic algorithm"
 license = "MIT"
diff --git a/README.md b/README.md
index 1e87999..476dac2 100644
--- a/README.md
+++ b/README.md
@@ -46,30 +46,30 @@ To run the benchmarks: `cargo bench --bench throughput`.
 
 ### Intel Ice Lake (x86 64-bit) (GCP n2-standard-2)
 
-| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
-|-------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
-| gxhash-avx2 | 4021.94 | 16113.58 | 42936.69 | 72145.2 | 94127.12 | 98261.24 | 100333.4 |
-| gxhash | 6122.63 | 24476.94 | 25591.9 | 51949.28 | 61253.58 | 64774.75 | 65708.38 |
-| xxhash | 915.69 | 4266.94 | 10339.13 | 10116.71 | 17164.93 | 20135.65 | 22834.07 |
-| ahash | 1838.59 | 8712.95 | 22473.84 | 25958.66 | 35090.25 | 38440.04 | 39308.7 |
-| t1ha0 | 740.15 | 2707.93 | 8572.39 | 28659.06 | 51202.34 | 59918.76 | 65902.36 |
-| seahash | 213.04 | 620.54 | 1762.72 | 2473.87 | 2761.71 | 2837.24 | 2860.51 |
-| metrohash | 754.55 | 2556.83 | 5983.26 | 10395.86 | 12738.02 | 13492.63 | 13624.54 |
-| highwayhash | 122.52 | 490.89 | 3278.71 | 7057.25 | 9726.72 | 10743.01 | 11036.79 |
-| fnv-1a | 1169.76 | 3062.36 | 1602.71 | 933.96 | 833.82 | 811.77 | 808.07 |
+| Method      |    4 |    16 |    64 |   256 |  1024 |   4096 |  16384 |
+|-------------|-----:|------:|------:|------:|------:|-------:|-------:|
+| gxhash-avx2 | 4189 | 16734 | 46142 | 72679 | 96109 | 102202 | 100845 |
+| gxhash      | 6069 | 24283 | 29465 | 49542 | 58164 |  62511 |  64281 |
+| xxhash      |  915 |  4266 | 10339 | 10116 | 17164 |  20135 |  22834 |
+| ahash       | 1838 |  8712 | 22473 | 25958 | 35090 |  38440 |  39308 |
+| t1ha0       |  740 |  2707 |  8572 | 28659 | 51202 |  59918 |  65902 |
+| seahash     |  213 |   620 |  1762 |  2473 |  2761 |   2837 |   2860 |
+| metrohash   |  754 |  2556 |  5983 | 10395 | 12738 |  13492 |  13624 |
+| highwayhash |  122 |   490 |  3278 |  7057 |  9726 |  10743 |  11036 |
+| fnv-1a      | 1169 |  3062 |  1602 |   933 |   833 |    811 |    808 |
 
 ### Macbook M1 Pro (ARM 64-bit)
 
-| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
-|--------------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
-| gxhash | 5441.06 | 21635.99 | 26282.95 | 59859.19 | 70175.71 | 74723.96 | 75020.74 |
-| xxhash | 1407.55 | 5638.49 | 11432.47 | 8380.32 | 16289.65 | 18690.69 | 19310.57 |
-| ahash | 1471.71 | 5920.45 | 15597.47 | 22280.2 | 28672.62 | 29631 | 31174.07 |
-| t1ha0 | 1181.94 | 4254.77 | 10277.71 | 15459.97 | 14120.73 | 13741.89 | 13743.4 |
-| seahash | 1130 | 4428.8 | 8756.7 | 9248.1 | 8357.73 | 8085.24 | 8056.4 |
-| metrohash | 1094.4 | 3389.34 | 9709.14 | 14431.34 | 17470 | 17679.48 | 17931.1 |
-| highwayhash | 182.95 | 743.38 | 2696.71 | 5196.88 | 6573.42 | 7061.91 | 7170.97 |
-| fnv-1a | 1988.88 | 2627.51 | 1407.3 | 896.08 | 777.74 | 753.23 | 745.68 |
+| Method      |    4 |    16 |    64 |   256 |  1024 |  4096 | 16384 |
+|-------------|-----:|------:|------:|------:|------:|------:|------:|
+| gxhash      | 6192 | 24901 | 31770 | 59465 | 72476 | 74723 | 76746 |
+| xxhash      | 1407 |  5638 | 11432 |  8380 | 16289 | 18690 | 19310 |
+| ahash       | 1471 |  5920 | 15597 | 22280 | 28672 | 29631 | 31174 |
+| t1ha0       | 1181 |  4254 | 10277 | 15459 | 14120 | 13741 | 13743 |
+| seahash     | 1130 |  4428 |  8756 |  9248 |  8357 |  8085 |  8056 |
+| metrohash   | 1094 |  3389 |  9709 | 14431 | 17470 | 17679 | 17931 |
+| highwayhash |  182 |   743 |  2696 |  5196 |  6573 |  7061 |  7170 |
+| fnv-1a      | 1988 |  2627 |  1407 |   896 |   777 |   753 |   745 |
 
 ## Debugging
 The algorithm is mostly inlined, making most profilers fail at providing useful information. The best I could achieve is profiling at assembly level. [cargo-asm](https://github.com/gnzlbg/cargo-asm) is an easy way to view the actual generated assembly code (`cargo asm gxhash::gxhash::gxhash`). [AMD μProf](https://www.amd.com/en/developer/uprof.html) gives some useful insights on time spent per instruction.
diff --git a/benches/throughput/main.rs b/benches/throughput/main.rs
index 051a8e9..2ed5819 100644
--- a/benches/throughput/main.rs
+++ b/benches/throughput/main.rs
@@ -13,7 +13,7 @@ use rand::Rng;
 use gxhash::*;
 
 const ITERATIONS: u32 = 1000;
-const MAX_RUN_DURATION: Duration = Duration::from_millis(500);
+const MAX_RUN_DURATION: Duration = Duration::from_millis(1000);
 const FORCE_NO_INLINING: bool = false;
 
 fn main() {
@@ -31,13 +31,13 @@ fn main() {
 
     // GxHash
     let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
-    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
+    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i64| -> u64 {
         gxhash64(data, seed)
     });
 
     // XxHash (twox-hash)
-    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: i32| -> u64 {
-        twox_hash::xxh3::hash64_with_seed(data, seed as u64)
+    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: u64| -> u64 {
+        twox_hash::xxh3::hash64_with_seed(data, seed)
     });
 
     // AHash
@@ -47,13 +47,13 @@ fn main() {
     });
 
     // T1ha0
-    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: i32| -> u64 {
-        t1ha::t1ha0(data, seed as u64)
+    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: u64| -> u64 {
+        t1ha::t1ha0(data, seed)
     });
 
     // SeaHash
-    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: i32| -> u64 {
-        seahash::hash_seeded(data, seed as u64, 0, 0, 0)
+    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: u64| -> u64 {
+        seahash::hash_seeded(data, seed, 0, 0, 0)
     });
 
     // MetroHash
@@ -70,8 +70,8 @@ fn main() {
     });
 
     // FNV-1a
-    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: i32| -> u64 {
-        let mut fnv_hasher = fnv::FnvHasher::with_key(seed as u64);
+    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: u64| -> u64 {
+        let mut fnv_hasher = fnv::FnvHasher::with_key(seed);
         fnv_hasher.write(data);
         fnv_hasher.finish()
     });
@@ -80,8 +80,8 @@ fn main() {
     unsafe { dealloc(ptr, layout) };
 }
 
-fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
-    where F: Fn(&[u8], i32) -> u64
+fn benchmark<F, S>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
+    where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize>
 {
     processor.on_start(name);
     for i in 2.. {
@@ -91,21 +91,23 @@ fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delega
         }
 
         // Warmup
-        black_box(time(ITERATIONS, &|| delegate(&data[..len], 0)));
+        black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default())));
 
         let mut total_duration: Duration = Duration::ZERO;
         let mut runs: usize = 0;
         let now = Instant::now();
         while now.elapsed() < MAX_RUN_DURATION {
-            // Prevent optimizations from predictable seed
-            let seed = total_duration.as_nanos() as i32;
-            // Prevent optimizations from predictable slice
-            // Also makes the benchmark use both aligned on unaligned data
-            let start = seed as usize & 0xFF;
+            // Make the seed unpredictable to prevent optimizations
+            let seed = S::try_from(total_duration.as_nanos())
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!"));
+            // Offset the slice by an unpredictable amount to prevent optimization (pre-caching)
+            // and make the benchmark use both aligned and unaligned data
+            let start = S::try_into(seed)
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF;
             let end = start + len;
             let slice = &data[start..end];
             // Execute the method for a new round of iterations
-            total_duration += time(ITERATIONS, &|| delegate(slice, seed));
+            total_duration += time(ITERATIONS, &|| delegate(slice, S::default()));
             runs += 1;
         }
         let throughput = (len as f64) / (1024f64 * 1024f64 * (total_duration.as_secs_f64() / runs as f64 / ITERATIONS as f64));
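The harness change above makes the seed type generic: any `S: Default + TryFrom<u128> + TryInto<usize>` works, so the elapsed-nanoseconds counter can both seed the hasher and derive the slice offset. A minimal sketch of how those bounds line up, with a hypothetical `run` helper that is not part of the benchmark code:

```rust
// Reduced model of the generic harness above; `run` and its arguments are
// illustrative, not taken from the patch.
fn run<F, S>(data: &[u8], nanos: u128, delegate: F) -> u64
where
    F: Fn(&[u8], S) -> u64,
    S: Default + TryFrom<u128> + TryInto<usize>,
{
    // TryFrom<u128> turns the unpredictable nanosecond counter into a seed value
    let seed = S::try_from(nanos).unwrap_or_else(|_| panic!("seed conversion failed"));
    // TryInto<usize> turns that seed back into a small slice offset (0..=255),
    // exercising both aligned and unaligned starting addresses
    let start = seed.try_into().unwrap_or_else(|_| panic!("offset conversion failed")) & 0xFF;
    delegate(&data[start..], S::default())
}

fn main() {
    let data = vec![0u8; 4096];
    // The closure's annotation picks S; here S = u64, as in the xxhash closure
    let _ = run(&data, 12345, |d: &[u8], s: u64| d.len() as u64 ^ s);
}
```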
diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs
index b2e9c40..08a8833 100644
--- a/src/gxhash/mod.rs
+++ b/src/gxhash/mod.rs
@@ -1,5 +1,7 @@
 pub(crate) mod platform;
 
+use std::intrinsics::likely;
+
 use platform::*;
 
 /// Hashes an arbitrary stream of bytes to a u32.
@@ -12,7 +14,7 @@
 /// println!("Hash is {:x}!", gxhash::gxhash32(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
+pub fn gxhash32(input: &[u8], seed: i64) -> u32 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u32;
         *p
@@ -29,7 +31,7 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
 /// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
+pub fn gxhash64(input: &[u8], seed: i64) -> u64 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u64;
         *p
@@ -46,7 +48,7 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
 /// println!("Hash is {:x}!", gxhash::gxhash128(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash128(input: &[u8], seed: i32) -> u128 {
+pub fn gxhash128(input: &[u8], seed: i64) -> u128 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u128;
         *p
@@ -58,18 +60,12 @@ macro_rules! load_unaligned {
     $(
         #[allow(unused_mut)]
         let mut $var = load_unaligned($ptr);
+        #[allow(unused_assignments)]
         $ptr = ($ptr).offset(1);
     )+
     };
 }
 
-const RANGE_1_BEGIN: usize = VECTOR_SIZE + 1;
-const RANGE_1_END: usize = VECTOR_SIZE * 2;
-const RANGE_2_BEGIN: usize = RANGE_1_BEGIN + 1;
-const RANGE_2_END: usize = VECTOR_SIZE * 3;
-const RANGE_3_BEGIN: usize = RANGE_2_BEGIN + 1;
-const RANGE_3_END: usize = VECTOR_SIZE * 4;
-
 #[inline(always)]
 pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
     finalize(compress_all(input), seed)
@@ -78,40 +74,51 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
 
 #[inline(always)]
 unsafe fn compress_all(input: &[u8]) -> State {
 
-    let len: usize = input.len();
+    let len = input.len();
     let mut ptr = input.as_ptr() as *const State;
 
-    let (mut hash_vector, remaining_bytes, p) = match len {
-        // Fast path with no compression for payloads that fit in a single state
-        0..=VECTOR_SIZE => {
-            (get_partial(ptr, len), 0, ptr)
-        },
-        RANGE_1_BEGIN..=RANGE_1_END => {
-            load_unaligned!(ptr, v1);
-            (v1, len - VECTOR_SIZE, ptr)
-        },
-        RANGE_2_BEGIN..=RANGE_2_END => {
-            load_unaligned!(ptr, v1, v2);
-            (compress(v1, v2), len - VECTOR_SIZE * 2, ptr)
-        },
-        RANGE_3_BEGIN..=RANGE_3_END => {
-            load_unaligned!(ptr, v1, v2, v3);
-            (compress(compress(v1, v2), v3), len - VECTOR_SIZE * 3, ptr)
-        },
-        _ => {
-            compress_many(ptr, create_empty(), len)
-        }
-    };
+    if likely(len <= VECTOR_SIZE) {
+        // The input fits in a single SIMD vector, but reading a full vector could
+        // overrun the input, so use the safe method, which checks whether it can
+        // read past the end or must copy into a zeroed buffer
+        return get_partial(ptr, len);
+    }
+
+    let remaining_bytes = len % VECTOR_SIZE;
 
-    if remaining_bytes > 0 {
-        hash_vector = compress(hash_vector, get_partial(p, remaining_bytes))
+    // The input does not fit in a single SIMD vector
+    let hash_vector: State;
+    if remaining_bytes == 0 {
+        load_unaligned!(ptr, v0);
+        hash_vector = v0;
+    } else {
+        // If the input length is not a whole number of SIMD vectors, a partial
+        // vector must be read. Reading that partial vector first lets us safely
+        // load full vectors beyond it, since the following bytes are still input
+        hash_vector = get_partial_unsafe(ptr, remaining_bytes);
+        ptr = ptr.byte_add(remaining_bytes);
     }
 
-    hash_vector
+    if len <= VECTOR_SIZE * 2 {
+        // Fast path when the input fits in 2 SIMD vectors
+        load_unaligned!(ptr, v0);
+        compress(hash_vector, v0)
+    } else if len <= VECTOR_SIZE * 3 {
+        // Fast path when the input fits in 3 SIMD vectors
+        load_unaligned!(ptr, v0, v1);
+        compress(hash_vector, compress(v0, v1))
+    } else if len <= VECTOR_SIZE * 4 {
+        // Fast path when the input fits in 4 SIMD vectors
+        load_unaligned!(ptr, v0, v1, v2);
+        compress(hash_vector, compress(compress(v0, v1), v2))
+    } else {
+        // The input is large, so use the high-ILP loop
+        compress_many(ptr, hash_vector, len)
+    }
 }
 
 #[inline(always)]
-unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> (State, usize, *const State) {
+unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State {
 
     const UNROLL_FACTOR: usize = 8;
@@ -143,8 +150,7 @@ unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_byt
         hash_vector = compress(hash_vector, v0);
     }
 
-    let remaining_bytes: usize = remaining_bytes & (VECTOR_SIZE - 1);
-    (hash_vector, remaining_bytes, ptr)
+    hash_vector
 }
 
 #[cfg(test)]
@@ -269,6 +275,7 @@ mod tests {
     fn is_stable() {
         assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
         assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
-        assert_eq!(3128839713, gxhash32(&[42u8; 1000], 1234));
+        assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0));
+        assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42));
     }
 }
\ No newline at end of file
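For intuition on the new layout in `compress_all`: because the `len % VECTOR_SIZE` leftover bytes are consumed first, every later full-width load starts at `remaining_bytes` and lands entirely inside the input. A scalar sketch of that offset arithmetic, using a hypothetical 16-byte block size in place of the platform `VECTOR_SIZE`:

```rust
// Sketch: with len = 40 and BLOCK = 16, remaining = 8. Consuming bytes [0..8)
// first, then full blocks at offsets 8 and 24, keeps every 16-byte read inside
// the 40-byte input. Reading full blocks first would instead leave a trailing
// 8-byte tail whose full-width read could overrun the buffer.
const BLOCK: usize = 16;

fn block_offsets(len: usize) -> (usize, Vec<usize>) {
    let remaining = len % BLOCK;
    // Full-width blocks start right after the partial prefix
    let offsets = (remaining..len).step_by(BLOCK).collect();
    (remaining, offsets)
}

fn main() {
    let (remaining, offsets) = block_offsets(40);
    assert_eq!(remaining, 8);
    assert_eq!(offsets, vec![8, 24]); // [8..24) and [24..40) are both in-bounds
}
```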
diff --git a/src/gxhash/platform/arm_128.rs b/src/gxhash/platform/arm_128.rs
index 0fd79a0..44b875e 100644
--- a/src/gxhash/platform/arm_128.rs
+++ b/src/gxhash/platform/arm_128.rs
@@ -20,8 +20,8 @@ pub unsafe fn create_empty() -> State {
 }
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    vreinterpretq_s8_s32(vdupq_n_s32(seed))
+pub unsafe fn create_seed(seed: i64) -> State {
+    vreinterpretq_s8_s64(vdupq_n_s64(seed))
 }
 
 #[inline(always)]
@@ -31,27 +31,30 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
 }
 
 #[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     if likely(check_same_page(p)) {
-        // Unsafe (hence the check) but much faster
-        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
-        let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
-        partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const i8, len as usize);
+        get_partial_safe(p, len)
     }
-    // Prevents padded zeroes to introduce bias
-    return vaddq_s8(partial_vector, vdupq_n_s8(len as i8));
 }
 
 #[inline(never)]
-unsafe fn get_partial_safe(data: *const i8, len: usize) -> State {
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
     let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into an int8x16_t vector
-    vld1q_s8(buffer.as_ptr())
+    let partial_vector = vld1q_s8(buffer.as_ptr());
+    // Prevents padded zeroes from introducing bias
+    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
+}
+
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+    let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
+    let partial_vector = vandq_s8(load_unaligned(data), ReinterpretUnion { uint8: mask }.int8);
+    // Prevents padded zeroes from introducing bias
+    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
 }
 
 #[inline(always)]
diff --git a/src/gxhash/platform/x86_128.rs b/src/gxhash/platform/x86_128.rs
index 53eab35..ed16d74 100644
--- a/src/gxhash/platform/x86_128.rs
+++ b/src/gxhash/platform/x86_128.rs
@@ -4,47 +4,51 @@ use super::*;
 
 pub type State = __m128i;
 
-#[inline]
+#[inline(always)]
 pub unsafe fn create_empty() -> State {
     _mm_setzero_si128()
 }
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    _mm_set1_epi32(seed)
+pub unsafe fn create_seed(seed: i64) -> State {
+    _mm_set1_epi64x(seed)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn load_unaligned(p: *const State) -> State {
     _mm_loadu_si128(p)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     // Safety check
     if check_same_page(p) {
-        let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
-        partial_vector = _mm_and_si128(_mm_loadu_si128(p), mask);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const u8, len as usize)
+        get_partial_safe(p, len)
    }
-    // Prevents padded zeroes to introduce bias
-    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
 }
 
-#[inline]
-unsafe fn get_partial_safe(data: *const u8, len: usize) -> State {
+#[inline(always)]
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
-    let mut buffer = [0u8; VECTOR_SIZE];
+    let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into a __m128i vector
-    _mm_loadu_si128(buffer.as_ptr() as *const State)
+    let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
+    // Prevents padded zeroes from introducing bias
+    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
 }
 
-#[inline]
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
+    let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask);
+    // Prevents padded zeroes from introducing bias
+    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
+}
+
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress(a: State, b: State) -> State {
     let keys_1 = _mm_set_epi32(0xF2784542, 0xB09D3E21, 0x89C222E5, 0xFC3BC28E);
@@ -56,13 +60,13 @@ pub unsafe fn compress(a: State, b: State) -> State {
     return _mm_aesenclast_si128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress_fast(a: State, b: State) -> State {
     return _mm_aesenc_si128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn finalize(hash: State, seed: State) -> State {
     // Hardcoded AES keys
diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs
index 5f96ad1..363dc15 100644
--- a/src/gxhash/platform/x86_256.rs
+++ b/src/gxhash/platform/x86_256.rs
@@ -4,47 +4,51 @@ use super::*;
 
 pub type State = __m256i;
 
-#[inline]
+#[inline(always)]
 pub unsafe fn create_empty() -> State {
     _mm256_setzero_si256()
 }
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    _mm256_set1_epi32(seed)
+pub unsafe fn create_seed(seed: i64) -> State {
+    _mm256_set1_epi64x(seed)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn load_unaligned(p: *const State) -> State {
     _mm256_loadu_si256(p)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     // Safety check
     if check_same_page(p) {
-        let indices = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices);
-        partial_vector = _mm256_and_si256(_mm256_loadu_si256(p), mask);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const u8, len as usize)
+        get_partial_safe(p, len)
     }
-    // Prevents padded zeroes to introduce bias
-    _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8))
 }
 
-#[inline]
-unsafe fn get_partial_safe(data: *const u8, len: usize) -> State {
+#[inline(always)]
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
-    let mut buffer = [0u8; VECTOR_SIZE];
+    let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into a __m256i vector
-    _mm256_loadu_si256(buffer.as_ptr() as *const State)
+    let partial_vector = _mm256_loadu_si256(buffer.as_ptr() as *const State);
+    // Prevents padded zeroes from introducing bias
+    _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8))
 }
 
-#[inline]
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices);
+    let partial_vector = _mm256_and_si256(_mm256_loadu_si256(data), mask);
+    // Prevents padded zeroes from introducing bias
+    _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8))
+}
+
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress(a: State, b: State) -> State {
     let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781);
@@ -56,13 +60,13 @@ pub unsafe fn compress(a: State, b: State) -> State {
     return _mm256_aesenclast_epi128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress_fast(a: State, b: State) -> State {
     return _mm256_aesenc_epi128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn finalize(hash: State, seed: State) -> State {
     // Hardcoded AES keys
diff --git a/src/hasher.rs b/src/hasher.rs
index 1991bdc..0b49407 100644
--- a/src/hasher.rs
+++ b/src/hasher.rs
@@ -31,7 +31,7 @@ impl GxHasher {
     /// println!("Hash is {:x}!", hasher.finish());
     /// ```
     #[inline]
-    pub fn with_seed(seed: i32) -> GxHasher {
+    pub fn with_seed(seed: i64) -> GxHasher {
         // Use gxhash64 to generate an initial state from a seed
         GxHasher(unsafe { gxhash(&[], create_seed(seed)) })
     }
diff --git a/src/lib.rs b/src/lib.rs
index 06251ea..89c3d9c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,7 @@
 #![feature(core_intrinsics)]
+#![feature(pointer_byte_offsets)]
 #![feature(stdsimd)]
+#![feature(stmt_expr_attributes)]
 
 mod gxhash;
 mod hasher;
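One piece the diff relies on but does not show is `check_same_page`, which decides between `get_partial_unsafe` and `get_partial_safe`. As a sketch of the idea only (the page size and exact comparison are assumptions, not taken from this patch): a full-vector load at `p` cannot fault as long as it does not cross into the next memory page.

```rust
// Sketch, assuming 4 KiB pages and a 16-byte State; not the repository's code.
type State = [u8; 16];
const VECTOR_SIZE: usize = std::mem::size_of::<State>();

unsafe fn check_same_page(p: *const State) -> bool {
    const PAGE_SIZE: usize = 0x1000; // assumed 4 KiB page
    let offset_within_page = p as usize & (PAGE_SIZE - 1);
    // The read [p, p + VECTOR_SIZE) stays within one page only if there is
    // room for a full vector before the page boundary
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE
}
```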