From 309fd66aeca3413d818efb09af634c693ff96d70 Mon Sep 17 00:00:00 2001
From: Olivier Giniaux
Date: Thu, 16 Nov 2023 17:11:56 +0100
Subject: [PATCH] Read partial vector first

---
 .github/workflows/rust.yml     |  6 ++-
 Cargo.toml                     |  4 +-
 benches/throughput/main.rs     | 40 ++++++++--------
 src/gxhash/mod.rs              | 85 ++++++++++++++++++----------------
 src/gxhash/platform/arm_128.rs | 29 ++++++------
 src/gxhash/platform/x86_128.rs | 44 ++++++++++--------
 src/gxhash/platform/x86_256.rs | 44 ++++++++++--------
 src/hasher.rs                  |  2 +-
 src/lib.rs                     |  2 +
 9 files changed, 140 insertions(+), 116 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 2bcc45a..a21302c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -20,6 +20,8 @@ jobs:
     - name: Switch to nightly
       run: rustup default nightly
     - name: Build
-      run: cargo build --verbose
+      run: cargo build --release
     - name: Run tests
-      run: cargo test --verbose
\ No newline at end of file
+      run: cargo test --release
+    - name: Benchmark
+      run: cargo bench --bench throughput
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index c2115a4..ebad45c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "gxhash"
-author = "Olivier Giniaux"
-version = "1.1.1"
+authors = ["Olivier Giniaux"]
+version = "2.0.0"
 edition = "2021"
 description = "GxHash non-cryptographic algorithm"
 license = "MIT"
diff --git a/benches/throughput/main.rs b/benches/throughput/main.rs
index 051a8e9..2ed5819 100644
--- a/benches/throughput/main.rs
+++ b/benches/throughput/main.rs
@@ -13,7 +13,7 @@ use rand::Rng;
 use gxhash::*;
 
 const ITERATIONS: u32 = 1000;
-const MAX_RUN_DURATION: Duration = Duration::from_millis(500);
+const MAX_RUN_DURATION: Duration = Duration::from_millis(1000);
 const FORCE_NO_INLINING: bool = false;
 
 fn main() {
@@ -31,13 +31,13 @@ fn main() {
 
     // GxHash
     let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
-    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
+    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i64| -> u64 {
         gxhash64(data, seed)
     });
 
     // XxHash (twox-hash)
-    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: i32| -> u64 {
-        twox_hash::xxh3::hash64_with_seed(data, seed as u64)
+    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: u64| -> u64 {
+        twox_hash::xxh3::hash64_with_seed(data, seed)
     });
 
     // AHash
@@ -47,13 +47,13 @@ fn main() {
     });
 
     // T1ha0
-    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: i32| -> u64 {
-        t1ha::t1ha0(data, seed as u64)
+    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: u64| -> u64 {
+        t1ha::t1ha0(data, seed)
     });
 
     // SeaHash
-    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: i32| -> u64 {
-        seahash::hash_seeded(data, seed as u64, 0, 0, 0)
+    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: u64| -> u64 {
+        seahash::hash_seeded(data, seed, 0, 0, 0)
     });
 
     // MetroHash
@@ -70,8 +70,8 @@ fn main() {
     });
 
     // FNV-1a
-    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: i32| -> u64 {
-        let mut fnv_hasher = fnv::FnvHasher::with_key(seed as u64);
+    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: u64| -> u64 {
+        let mut fnv_hasher = fnv::FnvHasher::with_key(seed);
         fnv_hasher.write(data);
         fnv_hasher.finish()
     });
@@ -80,8 +80,8 @@ fn main() {
     unsafe { dealloc(ptr, layout) };
 }
 
-fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
-    where F: Fn(&[u8], i32) -> u64
+fn benchmark<F, S>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
+    where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize>
 {
     processor.on_start(name);
     for i in 2.. {
@@ -91,21 +91,23 @@ fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delega
         }
 
         // Warmup
-        black_box(time(ITERATIONS, &|| delegate(&data[..len], 0)));
+        black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default())));
 
         let mut total_duration: Duration = Duration::ZERO;
         let mut runs: usize = 0;
         let now = Instant::now();
         while now.elapsed() < MAX_RUN_DURATION {
-            // Prevent optimizations from predictable seed
-            let seed = total_duration.as_nanos() as i32;
-            // Prevent optimizations from predictable slice
-            // Also makes the benchmark use both aligned on unaligned data
-            let start = seed as usize & 0xFF;
+            // Make seed unpredictable to prevent optimizations
+            let seed = S::try_from(total_duration.as_nanos())
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!"));
+            // Offset slice by an unpredictable amount to prevent optimization (pre caching)
+            // and make the benchmark use both aligned and unaligned data
+            let start = S::try_into(seed)
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF;
             let end = start + len;
             let slice = &data[start..end];
             // Execute method for a new iterations
-            total_duration += time(ITERATIONS, &|| delegate(slice, seed));
+            total_duration += time(ITERATIONS, &|| delegate(slice, S::default()));
             runs += 1;
         }
         let throughput = (len as f64) / (1024f64 * 1024f64 * (total_duration.as_secs_f64() / runs as f64 / ITERATIONS as f64));
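The harness is now generic over the seed type `S`, so each closure above picks whatever seed type its hash function expects. A minimal standalone sketch of the same `TryFrom<u128>`/`TryInto<usize>` dance (the names here are illustrative only, and the `Copy` bound is added for the sketch, not taken from the patch):

```rust
use std::time::Duration;

// Derive a seed and a slice offset from elapsed time, as the harness does:
// the seed type only needs Default + TryFrom<u128> + TryInto<usize> (+ Copy here).
fn seed_and_offset<S>(elapsed: Duration) -> (S, usize)
where
    S: Default + TryFrom<u128> + TryInto<usize> + Copy,
{
    let seed = S::try_from(elapsed.as_nanos())
        .unwrap_or_else(|_| panic!("elapsed nanos do not fit in the seed type"));
    // Keep only the low byte so the slice start stays within the padding
    let offset = seed
        .try_into()
        .unwrap_or_else(|_| panic!("seed does not fit in usize"))
        & 0xFF;
    (seed, offset)
}

fn main() {
    let (seed, offset): (i64, usize) = seed_and_offset(Duration::from_nanos(123_456_789));
    println!("seed = {seed}, offset = {offset}");
}
```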
diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs
index b2e9c40..08a8833 100644
--- a/src/gxhash/mod.rs
+++ b/src/gxhash/mod.rs
@@ -1,5 +1,7 @@
 pub(crate) mod platform;
 
+use std::intrinsics::likely;
+
 use platform::*;
 
 /// Hashes an arbitrary stream of bytes to an u32.
@@ -12,7 +14,7 @@ use platform::*;
 /// println!("Hash is {:x}!", gxhash::gxhash32(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
+pub fn gxhash32(input: &[u8], seed: i64) -> u32 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u32;
         *p
@@ -29,7 +31,7 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
 /// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
+pub fn gxhash64(input: &[u8], seed: i64) -> u64 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u64;
         *p
@@ -46,7 +48,7 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
 /// println!("Hash is {:x}!", gxhash::gxhash128(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash128(input: &[u8], seed: i32) -> u128 {
+pub fn gxhash128(input: &[u8], seed: i64) -> u128 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u128;
         *p
@@ -58,18 +60,12 @@ macro_rules! load_unaligned {
         $(
             #[allow(unused_mut)]
             let mut $var = load_unaligned($ptr);
+            #[allow(unused_assignments)]
             $ptr = ($ptr).offset(1);
         )+
     };
 }
 
-const RANGE_1_BEGIN: usize = VECTOR_SIZE + 1;
-const RANGE_1_END: usize = VECTOR_SIZE * 2;
-const RANGE_2_BEGIN: usize = RANGE_1_BEGIN + 1;
-const RANGE_2_END: usize = VECTOR_SIZE * 3;
-const RANGE_3_BEGIN: usize = RANGE_2_BEGIN + 1;
-const RANGE_3_END: usize = VECTOR_SIZE * 4;
-
 #[inline(always)]
 pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
     finalize(compress_all(input), seed)
 }
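With the seed widened from `i32` to `i64`, the three public entry points share one signature shape. A quick usage sketch, consistent with the doc-tests above:

```rust
use gxhash::{gxhash32, gxhash64, gxhash128};

fn main() {
    let bytes = [42u8; 1000];
    let seed: i64 = 1234;
    // Same input and seed, three output widths; all accept the widened i64 seed
    println!("{:x}", gxhash32(&bytes, seed));
    println!("{:x}", gxhash64(&bytes, seed));
    println!("{:x}", gxhash128(&bytes, seed));
}
```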
@@ -78,40 +74,51 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
 
 #[inline(always)]
 unsafe fn compress_all(input: &[u8]) -> State {
 
-    let len: usize = input.len();
+    let len = input.len();
     let mut ptr = input.as_ptr() as *const State;
 
-    let (mut hash_vector, remaining_bytes, p) = match len {
-        // Fast path with no compression for payloads that fit in a single state
-        0..=VECTOR_SIZE => {
-            (get_partial(ptr, len), 0, ptr)
-        },
-        RANGE_1_BEGIN..=RANGE_1_END => {
-            load_unaligned!(ptr, v1);
-            (v1, len - VECTOR_SIZE, ptr)
-        },
-        RANGE_2_BEGIN..=RANGE_2_END => {
-            load_unaligned!(ptr, v1, v2);
-            (compress(v1, v2), len - VECTOR_SIZE * 2, ptr)
-        },
-        RANGE_3_BEGIN..=RANGE_3_END => {
-            load_unaligned!(ptr, v1, v2, v3);
-            (compress(compress(v1, v2), v3), len - VECTOR_SIZE * 3, ptr)
-        },
-        _ => {
-            compress_many(ptr, create_empty(), len)
-        }
-    };
+    if likely(len <= VECTOR_SIZE) {
+        // The input fits in a single SIMD vector, but a full-width load could read
+        // beyond the input message, so use the safe method that checks or copies
+        return get_partial(ptr, len);
+    }
+
+    let remaining_bytes = len % VECTOR_SIZE;
 
-    if remaining_bytes > 0 {
-        hash_vector = compress(hash_vector, get_partial(p, remaining_bytes))
+    // The input does not fit in a single SIMD vector
+    let hash_vector: State;
+    if remaining_bytes == 0 {
+        load_unaligned!(ptr, v0);
+        hash_vector = v0;
+    } else {
+        // If the input length is not a multiple of the SIMD vector size, we'll
+        // need to read a partial vector. Reading the partial vector first makes
+        // the full-width loads below safe, because the bytes that follow it are
+        // still part of the input
+        hash_vector = get_partial_unsafe(ptr, remaining_bytes);
+        ptr = ptr.byte_add(remaining_bytes);
     }
 
-    hash_vector
+    if len <= VECTOR_SIZE * 2 {
+        // Fast path when the input fits in at most two vectors
+        load_unaligned!(ptr, v0);
+        compress(hash_vector, v0)
+    } else if len <= VECTOR_SIZE * 3 {
+        // Fast path when the input fits in at most three vectors
+        load_unaligned!(ptr, v0, v1);
+        compress(hash_vector, compress(v0, v1))
+    } else if len <= VECTOR_SIZE * 4 {
+        // Fast path when the input fits in at most four vectors
+        load_unaligned!(ptr, v0, v1, v2);
+        compress(hash_vector, compress(compress(v0, v1), v2))
+    } else {
+        // The input message is large: use the high-ILP compression loop
+        compress_many(ptr, hash_vector, len)
+    }
 }
 
 #[inline(always)]
-unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> (State, usize, *const State) {
+unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State {
 
     const UNROLL_FACTOR: usize = 8;
@@ -143,8 +150,7 @@ unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_byt
         hash_vector = compress(hash_vector, v0);
     }
 
-    let remaining_bytes: usize = remaining_bytes & (VECTOR_SIZE - 1);
-    (hash_vector, remaining_bytes, ptr)
+    hash_vector
 }
 
 #[cfg(test)]
 mod tests {
@@ -269,6 +275,7 @@ mod tests {
     fn is_stable() {
         assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
         assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
-        assert_eq!(3128839713, gxhash32(&[42u8; 1000], 1234));
+        assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0));
+        assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42));
     }
 }
\ No newline at end of file
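The heart of the change: `compress_all` now consumes the odd-sized remainder first and only then walks whole vectors, which is what makes its later full-width loads safe. A scalar model of the new read order (assuming the 128-bit case, `VECTOR_SIZE = 16`; `chunk_plan` is illustrative, not part of the crate):

```rust
// Scalar model of the new read order in compress_all. For a 40-byte input:
// 40 % 16 = 8, so the 8-byte partial chunk is read first, then two full
// 16-byte chunks. Reading the partial chunk first is what makes a full-width
// load of it safe: at least VECTOR_SIZE input bytes still follow it.
fn chunk_plan(len: usize) -> Vec<(usize, usize)> {
    const VECTOR_SIZE: usize = 16; // 128-bit state
    let mut chunks = Vec::new();
    if len <= VECTOR_SIZE {
        chunks.push((0, len)); // single (possibly partial) vector, safe path
        return chunks;
    }
    let remaining = len % VECTOR_SIZE;
    let mut offset = 0;
    if remaining != 0 {
        chunks.push((0, remaining)); // partial vector first
        offset = remaining;
    }
    while offset < len {
        chunks.push((offset, VECTOR_SIZE)); // then only full vectors
        offset += VECTOR_SIZE;
    }
    chunks
}

fn main() {
    assert_eq!(chunk_plan(40), vec![(0, 8), (8, 16), (24, 16)]);
}
```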
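`get_partial` leans on `check_same_page`, which sits outside this diff. A plausible sketch of that check, assuming 4 KiB pages (an assumption; the real implementation may differ): if the vector-wide read cannot cross a page boundary, reading past the end of the input cannot fault, because the rest of the page is mapped.

```rust
// Hypothetical sketch of the page check guarding get_partial_unsafe;
// the actual check_same_page is not shown in this patch.
const VECTOR_SIZE: usize = 16; // 128-bit NEON state
const PAGE_SIZE: usize = 0x1000; // assumed 4 KiB pages

fn check_same_page<T>(ptr: *const T) -> bool {
    let offset_within_page = ptr as usize & (PAGE_SIZE - 1);
    // The read [ptr, ptr + VECTOR_SIZE) stays inside the current page
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE
}

fn main() {
    let buffer = [0u8; 64];
    println!("same page: {}", check_same_page(buffer.as_ptr()));
}
```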
diff --git a/src/gxhash/platform/x86_128.rs b/src/gxhash/platform/x86_128.rs
index 53eab35..ed16d74 100644
--- a/src/gxhash/platform/x86_128.rs
+++ b/src/gxhash/platform/x86_128.rs
@@ -4,47 +4,51 @@ use super::*;
 
 pub type State = __m128i;
 
-#[inline]
+#[inline(always)]
 pub unsafe fn create_empty() -> State {
     _mm_setzero_si128()
 }
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    _mm_set1_epi32(seed)
+pub unsafe fn create_seed(seed: i64) -> State {
+    _mm_set1_epi64x(seed)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn load_unaligned(p: *const State) -> State {
     _mm_loadu_si128(p)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     // Safety check
     if check_same_page(p) {
-        let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
-        partial_vector = _mm_and_si128(_mm_loadu_si128(p), mask);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const u8, len as usize)
+        get_partial_safe(p, len)
     }
-    // Prevents padded zeroes to introduce bias
-    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
 }
 
-#[inline]
-unsafe fn get_partial_safe(data: *const u8, len: usize) -> State {
+#[inline(always)]
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
-    let mut buffer = [0u8; VECTOR_SIZE];
+    let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into a __m256i vector
-    _mm_loadu_si128(buffer.as_ptr() as *const State)
+    let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
+    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
 }
 
-#[inline]
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
+    let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask);
+    _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
+}
+
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress(a: State, b: State) -> State {
     let keys_1 = _mm_set_epi32(0xF2784542, 0xB09D3E21, 0x89C222E5, 0xFC3BC28E);
@@ -56,13 +60,13 @@ pub unsafe fn compress(a: State, b: State) -> State {
     return _mm_aesenclast_si128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress_fast(a: State, b: State) -> State {
     return _mm_aesenc_si128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn finalize(hash: State, seed: State) -> State {
     // Hardcoded AES keys
diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs
index 5f96ad1..363dc15 100644
--- a/src/gxhash/platform/x86_256.rs
+++ b/src/gxhash/platform/x86_256.rs
@@ -4,47 +4,51 @@ use super::*;
 
 pub type State = __m256i;
 
-#[inline]
+#[inline(always)]
 pub unsafe fn create_empty() -> State {
     _mm256_setzero_si256()
 }
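For reference, the mask-and-bias trick in `get_partial_unsafe`, modeled in scalar code (illustrative only): lane `i` is kept when `i < len`, and every lane is then biased by `len`, so short all-zero inputs of different lengths do not collapse to the same state.

```rust
// Scalar model of get_partial_unsafe (VECTOR_SIZE = 16): lane i survives the
// comparison mask only when i < len, zeroing whatever the full-width load
// picked up past the input; adding `len` to every lane afterwards prevents the
// padded zeroes from introducing bias between inputs of different lengths.
fn partial_lanes(loaded: [i8; 16], len: usize) -> [i8; 16] {
    let mut out = [0i8; 16];
    for i in 0..16 {
        // Mirrors _mm_cmpgt_epi8(_mm_set1_epi8(len), indices) + _mm_and_si128
        let masked = if (i as i8) < (len as i8) { loaded[i] } else { 0 };
        // Mirrors _mm_add_epi8(partial_vector, _mm_set1_epi8(len))
        out[i] = masked.wrapping_add(len as i8);
    }
    out
}

fn main() {
    let loaded = [7i8; 16]; // what an over-reading full-width load might return
    let lanes = partial_lanes(loaded, 3);
    let mut expected = [3i8; 16]; // garbage lanes zeroed, then biased by len
    expected[..3].fill(10); // kept bytes (7) biased by len (3)
    assert_eq!(lanes, expected);
}
```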
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    _mm256_set1_epi32(seed)
+pub unsafe fn create_seed(seed: i64) -> State {
+    _mm256_set1_epi64x(seed)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn load_unaligned(p: *const State) -> State {
     _mm256_loadu_si256(p)
 }
 
-#[inline]
+#[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     // Safety check
     if check_same_page(p) {
-        let indices = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-        let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices);
-        partial_vector = _mm256_and_si256(_mm256_loadu_si256(p), mask);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const u8, len as usize)
+        get_partial_safe(p, len)
     }
-    // Prevents padded zeroes to introduce bias
-    _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8))
 }
 
-#[inline]
-unsafe fn get_partial_safe(data: *const u8, len: usize) -> State {
+#[inline(always)]
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
-    let mut buffer = [0u8; VECTOR_SIZE];
+    let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into a __m256i vector
-    _mm256_loadu_si256(buffer.as_ptr() as *const State)
+    let partial_vector = _mm256_loadu_si256(buffer.as_ptr() as *const State);
+    _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8))
 }
 
-#[inline]
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices);
+    let partial_vector = _mm256_and_si256(_mm256_loadu_si256(data), mask);
+    _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8))
+}
+
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress(a: State, b: State) -> State {
     let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781);
@@ -56,13 +60,13 @@ pub unsafe fn compress(a: State, b: State) -> State {
     return _mm256_aesenclast_epi128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn compress_fast(a: State, b: State) -> State {
     return _mm256_aesenc_epi128(a, b);
 }
 
-#[inline]
+#[inline(always)]
 #[allow(overflowing_literals)]
 pub unsafe fn finalize(hash: State, seed: State) -> State {
     // Hardcoded AES keys
diff --git a/src/hasher.rs b/src/hasher.rs
index 1991bdc..0b49407 100644
--- a/src/hasher.rs
+++ b/src/hasher.rs
@@ -31,7 +31,7 @@ impl GxHasher {
     /// println!("Hash is {:x}!", hasher.finish());
     /// ```
     #[inline]
-    pub fn with_seed(seed: i32) -> GxHasher {
+    pub fn with_seed(seed: i64) -> GxHasher {
         // Use gxhash64 to generate an initial state from a seed
         GxHasher(unsafe { gxhash(&[], create_seed(seed)) })
     }
diff --git a/src/lib.rs b/src/lib.rs
index 06251ea..89c3d9c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,7 @@
 #![feature(core_intrinsics)]
+#![feature(pointer_byte_offsets)]
 #![feature(stdsimd)]
+#![feature(stmt_expr_attributes)]
 
 mod gxhash;
 mod hasher;
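The `Hasher` front-end follows the same seed widening, so seeded use looks like this (assuming `GxHasher` stays re-exported at the crate root, as its doc-test suggests):

```rust
use std::hash::Hasher;
use gxhash::GxHasher;

fn main() {
    // with_seed now takes an i64, matching the widened create_seed
    let mut hasher = GxHasher::with_seed(1234);
    hasher.write(b"some bytes");
    hasher.write_u32(42);
    println!("Hash is {:x}!", hasher.finish());
}
```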