Commit 309fd66: Read partial vector first
ogxd committed Nov 17, 2023
1 parent 69b7c7c commit 309fd66
Showing 9 changed files with 140 additions and 116 deletions.
.github/workflows/rust.yml (6 changes: 4 additions & 2 deletions)

@@ -20,6 +20,8 @@ jobs:
       - name: Switch to nightly
         run: rustup default nightly
       - name: Build
-        run: cargo build --verbose
+        run: cargo build --release
       - name: Run tests
-        run: cargo test --verbose
+        run: cargo test --release
+      - name: Benchmark
+        run: cargo bench --bench throughput
Cargo.toml (4 changes: 2 additions & 2 deletions)

@@ -1,7 +1,7 @@
 [package]
 name = "gxhash"
-author = "Olivier Giniaux"
-version = "1.1.1"
+authors = ["Olivier Giniaux"]
+version = "2.0.0"
 edition = "2021"
 description = "GxHash non-cryptographic algorithm"
 license = "MIT"
benches/throughput/main.rs (40 changes: 21 additions & 19 deletions)

@@ -13,7 +13,7 @@ use rand::Rng;
 use gxhash::*;
 
 const ITERATIONS: u32 = 1000;
-const MAX_RUN_DURATION: Duration = Duration::from_millis(500);
+const MAX_RUN_DURATION: Duration = Duration::from_millis(1000);
 const FORCE_NO_INLINING: bool = false;
 
 fn main() {
@@ -31,13 +31,13 @@ fn main() {
 
     // GxHash
     let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
-    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
+    benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i64| -> u64 {
         gxhash64(data, seed)
     });
 
     // XxHash (twox-hash)
-    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: i32| -> u64 {
-        twox_hash::xxh3::hash64_with_seed(data, seed as u64)
+    benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: u64| -> u64 {
+        twox_hash::xxh3::hash64_with_seed(data, seed)
     });
 
     // AHash
@@ -47,13 +47,13 @@ fn main() {
     });
 
     // T1ha0
-    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: i32| -> u64 {
-        t1ha::t1ha0(data, seed as u64)
+    benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: u64| -> u64 {
+        t1ha::t1ha0(data, seed)
     });
 
     // SeaHash
-    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: i32| -> u64 {
-        seahash::hash_seeded(data, seed as u64, 0, 0, 0)
+    benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: u64| -> u64 {
+        seahash::hash_seeded(data, seed, 0, 0, 0)
     });
 
     // MetroHash
@@ -70,8 +70,8 @@ fn main() {
     });
 
     // FNV-1a
-    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: i32| -> u64 {
-        let mut fnv_hasher = fnv::FnvHasher::with_key(seed as u64);
+    benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: u64| -> u64 {
+        let mut fnv_hasher = fnv::FnvHasher::with_key(seed);
         fnv_hasher.write(data);
         fnv_hasher.finish()
     });
@@ -80,8 +80,8 @@ fn main() {
     unsafe { dealloc(ptr, layout) };
 }
 
-fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
-    where F: Fn(&[u8], i32) -> u64
+fn benchmark<F, S>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
+    where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize>
 {
     processor.on_start(name);
     for i in 2.. {
@@ -91,21 +91,23 @@ fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delega
         }
 
         // Warmup
-        black_box(time(ITERATIONS, &|| delegate(&data[..len], 0)));
+        black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default())));
 
         let mut total_duration: Duration = Duration::ZERO;
        let mut runs: usize = 0;
        let now = Instant::now();
        while now.elapsed() < MAX_RUN_DURATION {
-            // Prevent optimizations from predictable seed
-            let seed = total_duration.as_nanos() as i32;
-            // Prevent optimizations from predictable slice
-            // Also makes the benchmark use both aligned on unaligned data
-            let start = seed as usize & 0xFF;
+            // Make seed unpredictable to prevent optimizations
+            let seed = S::try_from(total_duration.as_nanos())
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!"));
+            // Offset slice by an unpredictable amount to prevent optimization (pre caching)
+            // and make the benchmark use both aligned and unaligned data
+            let start = S::try_into(seed)
+                .unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF;
            let end = start + len;
            let slice = &data[start..end];
            // Execute method for a new iterations
-            total_duration += time(ITERATIONS, &|| delegate(slice, seed));
+            total_duration += time(ITERATIONS, &|| delegate(slice, S::default()));
            runs += 1;
        }
        let throughput = (len as f64) / (1024f64 * 1024f64 * (total_duration.as_secs_f64() / runs as f64 / ITERATIONS as f64));
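The harness is now generic over the seed type `S`, so each hash receives a seed in its native width (i64 for gxhash, u64 for the others) with no lossy casts at the call site. A condensed sketch of the same pattern, with `run_bench` as a hypothetical stand-in for the harness above (the 100 ms budget is arbitrary, and warmup plus `ResultProcessor` bookkeeping are omitted):

```rust
use std::hint::black_box;
use std::time::{Duration, Instant};

// `data` must be at least `len + 255` bytes, since the slice start is
// offset by up to 0xFF below.
fn run_bench<F, S>(name: &str, data: &[u8], len: usize, hash: F)
where
    F: Fn(&[u8], S) -> u64,
    S: Default + TryFrom<u128> + TryInto<usize>,
{
    let mut total = Duration::ZERO;
    let mut runs = 0usize;
    let clock = Instant::now();
    while clock.elapsed() < Duration::from_millis(100) {
        // A value derived from elapsed time is unpredictable to the
        // optimizer, so nothing here can be constant-folded...
        let seed = S::try_from(total.as_nanos())
            .unwrap_or_else(|_| panic!("seed does not fit in S"));
        // ...and it doubles as a 0..=255 slice offset, exercising both
        // aligned and unaligned starting addresses.
        let start: usize = seed
            .try_into()
            .unwrap_or_else(|_| panic!("offset does not fit in usize"))
            & 0xFF;
        let slice = &data[start..start + len];
        let begin = Instant::now();
        // The unpredictability lives in the offset; the hash itself gets a
        // default seed, mirroring the harness above.
        black_box(hash(slice, S::default()));
        total += begin.elapsed();
        runs += 1;
    }
    let mib_s = (len * runs) as f64 / (1024.0 * 1024.0 * total.as_secs_f64());
    println!("{name}: {mib_s:.0} MiB/s");
}
```

A closure such as `|data, seed: u64| t1ha::t1ha0(data, seed)` then selects its own `S` through inference, which is exactly how the per-algorithm closures in the diff work.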
src/gxhash/mod.rs (85 changes: 46 additions & 39 deletions)
@@ -1,5 +1,7 @@
 pub(crate) mod platform;
 
+use std::intrinsics::likely;
+
 use platform::*;
 
 /// Hashes an arbitrary stream of bytes to an u32.
@@ -12,7 +14,7 @@ use platform::*;
 /// println!("Hash is {:x}!", gxhash::gxhash32(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
+pub fn gxhash32(input: &[u8], seed: i64) -> u32 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u32;
         *p
@@ -29,7 +31,7 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
 /// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
+pub fn gxhash64(input: &[u8], seed: i64) -> u64 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u64;
         *p
@@ -46,7 +48,7 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
 /// println!("Hash is {:x}!", gxhash::gxhash128(&bytes, seed));
 /// ```
 #[inline(always)]
-pub fn gxhash128(input: &[u8], seed: i32) -> u128 {
+pub fn gxhash128(input: &[u8], seed: i64) -> u128 {
     unsafe {
         let p = &gxhash(input, create_seed(seed)) as *const State as *const u128;
         *p
@@ -58,18 +60,12 @@ macro_rules! load_unaligned {
         $(
             #[allow(unused_mut)]
             let mut $var = load_unaligned($ptr);
+            #[allow(unused_assignments)]
            $ptr = ($ptr).offset(1);
        )+
    };
 }
 
-const RANGE_1_BEGIN: usize = VECTOR_SIZE + 1;
-const RANGE_1_END: usize = VECTOR_SIZE * 2;
-const RANGE_2_BEGIN: usize = RANGE_1_BEGIN + 1;
-const RANGE_2_END: usize = VECTOR_SIZE * 3;
-const RANGE_3_BEGIN: usize = RANGE_2_BEGIN + 1;
-const RANGE_3_END: usize = VECTOR_SIZE * 4;
-
 #[inline(always)]
 pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
     finalize(compress_all(input), seed)
@@ -78,40 +74,51 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
 #[inline(always)]
 unsafe fn compress_all(input: &[u8]) -> State {
 
-    let len: usize = input.len();
+    let len = input.len();
     let mut ptr = input.as_ptr() as *const State;
 
-    let (mut hash_vector, remaining_bytes, p) = match len {
-        // Fast path with no compression for payloads that fit in a single state
-        0..=VECTOR_SIZE => {
-            (get_partial(ptr, len), 0, ptr)
-        },
-        RANGE_1_BEGIN..=RANGE_1_END => {
-            load_unaligned!(ptr, v1);
-            (v1, len - VECTOR_SIZE, ptr)
-        },
-        RANGE_2_BEGIN..=RANGE_2_END => {
-            load_unaligned!(ptr, v1, v2);
-            (compress(v1, v2), len - VECTOR_SIZE * 2, ptr)
-        },
-        RANGE_3_BEGIN..=RANGE_3_END => {
-            load_unaligned!(ptr, v1, v2, v3);
-            (compress(compress(v1, v2), v3), len - VECTOR_SIZE * 3, ptr)
-        },
-        _ => {
-            compress_many(ptr, create_empty(), len)
-        }
-    };
+    if likely(len <= VECTOR_SIZE) {
+        // Input fits on a single SIMD vector, however we might read beyond the input message
+        // Thus we need this safe method that checks if it can safely read beyond or must copy
+        return get_partial(ptr, len);
+    }
+
+    let remaining_bytes = len % VECTOR_SIZE;
 
-    if remaining_bytes > 0 {
-        hash_vector = compress(hash_vector, get_partial(p, remaining_bytes))
+    // The input does not fit on a single SIMD vector
+    let hash_vector: State;
+    if remaining_bytes == 0 {
+        load_unaligned!(ptr, v0);
+        hash_vector = v0;
+    } else {
+        // If the input length does not match the length of a whole number of SIMD vectors,
+        // it means we'll need to read a partial vector. We can start with the partial vector first,
+        // so that we can safely read beyond since we expect the following bytes to still be part of
+        // the input
+        hash_vector = get_partial_unsafe(ptr, remaining_bytes as usize);
+        ptr = ptr.byte_add(remaining_bytes);
     }
 
-    hash_vector
+    if len <= VECTOR_SIZE * 2 {
+        // Fast path when input length > 16 and <= 32
+        load_unaligned!(ptr, v0);
+        compress(hash_vector, v0)
+    } else if len <= VECTOR_SIZE * 3 {
+        // Fast path when input length > 32 and <= 48
+        load_unaligned!(ptr, v0, v1);
+        compress(hash_vector, compress(v0, v1))
+    } else if len <= VECTOR_SIZE * 4 {
+        // Fast path when input length > 48 and <= 64
+        load_unaligned!(ptr, v0, v1, v2);
+        compress(hash_vector, compress(compress(v0, v1), v2))
+    } else {
+        // Input message is large and we can use the high ILP loop
+        compress_many(ptr, hash_vector, len)
+    }
 }
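The heart of the commit is the reordering above: when the input does not divide evenly into SIMD vectors, the leftover partial vector is consumed first, from the front of the buffer, where over-reading is guaranteed to stay inside the input. Every subsequent read is then a full, in-bounds vector load. A scalar sketch of the same control flow, with 8-byte u64 words standing in for 16-byte SIMD vectors and a hypothetical `mix` function standing in for `compress`:

```rust
const BLOCK: usize = 8;

// Hypothetical combiner; gxhash's real `compress` is an AES-based SIMD mix.
fn mix(a: u64, b: u64) -> u64 {
    (a.rotate_left(23) ^ b).wrapping_mul(0x9E37_79B9_7F4A_7C15)
}

fn hash_blocks(input: &[u8]) -> u64 {
    let len = input.len();
    if len <= BLOCK {
        // Short input: a bounds-checked copy into a zeroed buffer is unavoidable.
        let mut buf = [0u8; BLOCK];
        buf[..len].copy_from_slice(input);
        return u64::from_le_bytes(buf) ^ len as u64;
    }
    let remainder = len % BLOCK;
    let (mut state, mut offset) = if remainder == 0 {
        (u64::from_le_bytes(input[..BLOCK].try_into().unwrap()), BLOCK)
    } else {
        // Read the partial block FIRST, as a full-width over-read from the
        // start: the extra bytes still belong to the input, since len > BLOCK.
        let word = u64::from_le_bytes(input[..BLOCK].try_into().unwrap());
        // Mask away the bytes that are not part of the partial block.
        let mask = u64::MAX >> (8 * (BLOCK - remainder));
        ((word & mask) ^ remainder as u64, remainder)
    };
    // From here on, every read is a whole block and always in bounds.
    while offset < len {
        state = mix(state, u64::from_le_bytes(input[offset..offset + BLOCK].try_into().unwrap()));
        offset += BLOCK;
    }
    state
}
```

Only the `len <= BLOCK` path ever needs the copy-or-mask safety dance; all larger inputs reduce to whole-vector loads, which is what the rewritten `compress_all` exploits.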

 #[inline(always)]
-unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> (State, usize, *const State) {
+unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State {
 
     const UNROLL_FACTOR: usize = 8;
 
@@ -143,8 +150,7 @@ unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_byt
         hash_vector = compress(hash_vector, v0);
     }
 
-    let remaining_bytes: usize = remaining_bytes & (VECTOR_SIZE - 1);
-    (hash_vector, remaining_bytes, ptr)
+    hash_vector
 }
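`compress_many`, whose body is only partially visible in this diff, now takes the pre-seeded `hash_vector` and returns the final state directly, processing the bulk of the input in groups of `UNROLL_FACTOR = 8` so that several independent loads are in flight at once. A hedged scalar sketch of that unrolling idea (the real loop works on SIMD states and unrolls by 8; this illustration unrolls by 4 and reuses the hypothetical `mix`):

```rust
fn mix(a: u64, b: u64) -> u64 {
    (a.rotate_left(23) ^ b).wrapping_mul(0x9E37_79B9_7F4A_7C15)
}

fn compress_bulk(words: &[u64], mut state: u64) -> u64 {
    let mut chunks = words.chunks_exact(4);
    for group in &mut chunks {
        // The four loads carry no dependency on each other, so the CPU can
        // issue them in parallel; only the final fold touches `state`.
        let folded = mix(mix(group[0], group[1]), mix(group[2], group[3]));
        state = mix(state, folded);
    }
    // Whole words that did not fill a complete group.
    for &word in chunks.remainder() {
        state = mix(state, word);
    }
    state
}
```

Folding the group into one value before touching `state` keeps the serial dependency chain short, which is the instruction-level parallelism the comment "high ILP loop" refers to.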

 #[cfg(test)]
@@ -269,6 +275,7 @@ mod tests {
     fn is_stable() {
         assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
         assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
+        assert_eq!(3128839713, gxhash32(&[42u8; 1000], 1234));
         assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0));
         assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42));
     }
 }
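At the API surface, the visible change is the seed widening from i32 to i64, which is presumably what motivates the major version bump to 2.0.0 in Cargo.toml. A minimal call-site sketch; the expected value comes from the new `is_stable` test vector above:

```rust
fn main() {
    let bytes = [42u8; 1000];
    let seed: i64 = 1234; // was i32 before this commit

    // All three variants now take an i64 seed.
    println!("{:x}", gxhash::gxhash64(&bytes, seed));
    assert_eq!(3128839713u32, gxhash::gxhash32(&bytes, seed));
}
```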
src/gxhash/platform/arm_128.rs (29 changes: 16 additions & 13 deletions)
@@ -20,8 +20,8 @@ pub unsafe fn create_empty() -> State {
 }
 
 #[inline(always)]
-pub unsafe fn create_seed(seed: i32) -> State {
-    vreinterpretq_s8_s32(vdupq_n_s32(seed))
+pub unsafe fn create_seed(seed: i64) -> State {
+    vreinterpretq_s8_s64(vdupq_n_s64(seed))
 }
 
 #[inline(always)]
@@ -31,27 +31,30 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
 }
 
 #[inline(always)]
 pub unsafe fn get_partial(p: *const State, len: usize) -> State {
-    let partial_vector: State;
     if likely(check_same_page(p)) {
-        // Unsafe (hence the check) but much faster
-        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
-        let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
-        partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8);
+        get_partial_unsafe(p, len)
     } else {
-        partial_vector = get_partial_safe(p as *const i8, len as usize);
+        get_partial_safe(p, len)
     }
-    // Prevents padded zeroes to introduce bias
-    return vaddq_s8(partial_vector, vdupq_n_s8(len as i8));
 }
 
 #[inline(never)]
-unsafe fn get_partial_safe(data: *const i8, len: usize) -> State {
+pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
     // Temporary buffer filled with zeros
     let mut buffer = [0i8; VECTOR_SIZE];
     // Copy data into the buffer
-    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
     // Load the buffer into a __m256i vector
-    vld1q_s8(buffer.as_ptr())
+    let partial_vector = vld1q_s8(buffer.as_ptr());
+    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
 }
 
+#[inline(always)]
+pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
+    let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+    let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
+    let partial_vector = vandq_s8(load_unaligned(data), ReinterpretUnion { uint8: mask }.int8);
+    vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
+}
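`get_partial` only takes the masked over-read fast path when `check_same_page`, which is not shown in this diff, confirms that a full 16-byte load starting at `p` cannot cross into the next memory page. A hedged sketch of what such a check looks like, assuming 4 KiB pages; the repository's actual implementation may differ in detail:

```rust
const VECTOR_SIZE: usize = 16; // size of a 128-bit NEON vector, as in the diff
const PAGE_SIZE: usize = 0x1000; // assumed 4 KiB pages

#[inline(always)]
fn check_same_page(ptr: *const i8) -> bool {
    let address = ptr as usize;
    // Position of the pointer within its page.
    let offset_within_page = address & (PAGE_SIZE - 1);
    // The 16-byte read stays inside the page only if at least
    // VECTOR_SIZE bytes remain before the page boundary.
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE
}
```

If the whole window lies within one mapped page, reading past the end of the slice cannot fault, so the fast branch may use the masked vector load even though it technically reads out of bounds; otherwise the copying `get_partial_safe` fallback is used.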
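Both partial-load paths now end with `vaddq_s8(partial_vector, vdupq_n_s8(len as i8))`, matching the old comment "Prevents padded zeroes to introduce bias": without it, all-zero inputs of different lengths would collapse to the same zero-padded vector. A scalar illustration of the idea (assuming `input.len() <= 16`, as the callers guarantee):

```rust
fn partial_lanes(input: &[u8]) -> [i8; 16] {
    assert!(input.len() <= 16);
    let mut lanes = [0i8; 16];
    for (lane, &byte) in lanes.iter_mut().zip(input) {
        *lane = byte as i8;
    }
    // Without this addition, [0u8; 1] and [0u8; 2] would map to the
    // same zero-padded vector.
    for lane in &mut lanes {
        *lane = lane.wrapping_add(input.len() as i8);
    }
    lanes
}
```

With the length folded in, `partial_lanes(&[0u8; 1])` and `partial_lanes(&[0u8; 2])` differ in every lane, so equal-content inputs of different lengths hash differently.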
