Skip to content

Commit

Permalink
Read partial vector first
Browse files Browse the repository at this point in the history
  • Loading branch information
ogxd committed Nov 17, 2023
1 parent 69b7c7c commit 8bccbe4
Show file tree
Hide file tree
Showing 10 changed files with 161 additions and 137 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ jobs:
- name: Switch to nightly
run: rustup default nightly
- name: Build
run: cargo build --verbose
run: cargo build --release
- name: Run tests
run: cargo test --verbose
run: cargo test --release
- name: Benchmark
run: cargo bench --bench throughput
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "gxhash"
author = "Olivier Giniaux"
version = "1.1.1"
authors = ["Olivier Giniaux"]
version = "2.0.0"
edition = "2021"
description = "GxHash non-cryptographic algorithm"
license = "MIT"
Expand Down
42 changes: 21 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,30 +46,30 @@ To run the benchmarks: `cargo bench --bench throughput`.

### Intel Ice Lake (x86 64-bit) (GCP n2-standard-2)

| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|-------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
| gxhash-avx2 | 4021.94 | 16113.58 | 42936.69 | 72145.2 | 94127.12 | 98261.24 | 100333.4 |
| gxhash | 6122.63 | 24476.94 | 25591.9 | 51949.28 | 61253.58 | 64774.75 | 65708.38 |
| xxhash | 915.69 | 4266.94 | 10339.13 | 10116.71 | 17164.93 | 20135.65 | 22834.07 |
| ahash | 1838.59 | 8712.95 | 22473.84 | 25958.66 | 35090.25 | 38440.04 | 39308.7 |
| t1ha0 | 740.15 | 2707.93 | 8572.39 | 28659.06 | 51202.34 | 59918.76 | 65902.36 |
| seahash | 213.04 | 620.54 | 1762.72 | 2473.87 | 2761.71 | 2837.24 | 2860.51 |
| metrohash | 754.55 | 2556.83 | 5983.26 | 10395.86 | 12738.02 | 13492.63 | 13624.54 |
| highwayhash | 122.52 | 490.89 | 3278.71 | 7057.25 | 9726.72 | 10743.01 | 11036.79 |
| fnv-1a | 1169.76 | 3062.36 | 1602.71 | 933.96 | 833.82 | 811.77 | 808.07 |
| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|-------------|-----:|------:|------:|------:|------:|-------:|-------:|
| gxhash-avx2 | 4189 | 16734 | 46142 | 72679 | 96109 | 102202 | 100845 |
| gxhash | 6069 | 24283 | 29465 | 49542 | 58164 | 62511 | 64281 |
| xxhash | 915 | 4266 | 10339 | 10116 | 17164 | 20135 | 22834 |
| ahash | 1838 | 8712 | 22473 | 25958 | 35090 | 38440 | 39308 |
| t1ha0 | 740 | 2707 | 8572 | 28659 | 51202 | 59918 | 65902 |
| seahash | 213 | 620 | 1762 | 2473 | 2761 | 2837 | 2860 |
| metrohash | 754 | 2556 | 5983 | 10395 | 12738 | 13492 | 13624 |
| highwayhash | 122 | 490 | 3278 | 7057 | 9726 | 10743 | 11036 |
| fnv-1a | 1169 | 3062 | 1602 | 933 | 833 | 811 | 808 |

### Macbook M1 Pro (ARM 64-bit)

| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|--------------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
| gxhash | 5441.06 | 21635.99 | 26282.95 | 59859.19 | 70175.71 | 74723.96 | 75020.74 |
| xxhash | 1407.55 | 5638.49 | 11432.47 | 8380.32 | 16289.65 | 18690.69 | 19310.57 |
| ahash | 1471.71 | 5920.45 | 15597.47 | 22280.2 | 28672.62 | 29631 | 31174.07 |
| t1ha0 | 1181.94 | 4254.77 | 10277.71 | 15459.97 | 14120.73 | 13741.89 | 13743.4 |
| seahash | 1130 | 4428.8 | 8756.7 | 9248.1 | 8357.73 | 8085.24 | 8056.4 |
| metrohash | 1094.4 | 3389.34 | 9709.14 | 14431.34 | 17470 | 17679.48 | 17931.1 |
| highwayhash | 182.95 | 743.38 | 2696.71 | 5196.88 | 6573.42 | 7061.91 | 7170.97 |
| fnv-1a | 1988.88 | 2627.51 | 1407.3 | 896.08 | 777.74 | 753.23 | 745.68 |
| Method | 4 | 16 | 64 | 256 | 1024 | 4096 | 16384 |
|-------------|-----:|------:|------:|------:|------:|------:|-------:|
| gxhash | 6192 | 24901 | 31770 | 59465 | 72476 | 74723 | 76746 |
| xxhash | 1407 | 5638 | 11432 | 8380 | 16289 | 18690 | 19310 |
| ahash | 1471 | 5920 | 15597 | 22280 | 28672 | 29631 | 31174 |
| t1ha0 | 1181 | 4254 | 10277 | 15459 | 14120 | 13741 | 13743 |
| seahash | 1130 | 4428 | 8756 | 9248 | 8357 | 8085 | 8056 |
| metrohash | 1094 | 3389 | 9709 | 14431 | 17470 | 17679 | 17931 |
| highwayhash | 182 | 743 | 2696 | 5196 | 6573 | 7061 | 7170 |
| fnv-1a | 1988 | 2627 | 1407 | 896 | 777 | 753 | 745 |

## Debugging
The algorithm is mostly inlined, making most profilers fail at providing useful intrinsics. The best I could achieve is profiling at assembly level. [cargo-asm](https://github.com/gnzlbg/cargo-asm) is an easy way to view the actual generated assembly code (`cargo asm gxhash::gxhash::gxhash`). [AMD μProf](https://www.amd.com/en/developer/uprof.html) gives some useful insights on time spent per instruction.
Expand Down
40 changes: 21 additions & 19 deletions benches/throughput/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use rand::Rng;
use gxhash::*;

const ITERATIONS: u32 = 1000;
const MAX_RUN_DURATION: Duration = Duration::from_millis(500);
const MAX_RUN_DURATION: Duration = Duration::from_millis(1000);
const FORCE_NO_INLINING: bool = false;

fn main() {
Expand All @@ -31,13 +31,13 @@ fn main() {

// GxHash
let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
benchmark(&mut processor, slice, algo_name, |data: &[u8], seed: i64| -> u64 {
gxhash64(data, seed)
});

// XxHash (twox-hash)
benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: i32| -> u64 {
twox_hash::xxh3::hash64_with_seed(data, seed as u64)
benchmark(&mut processor, slice, "xxhash", |data: &[u8], seed: u64| -> u64 {
twox_hash::xxh3::hash64_with_seed(data, seed)
});

// AHash
Expand All @@ -47,13 +47,13 @@ fn main() {
});

// T1ha0
benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: i32| -> u64 {
t1ha::t1ha0(data, seed as u64)
benchmark(&mut processor, slice, "t1ha0", |data: &[u8], seed: u64| -> u64 {
t1ha::t1ha0(data, seed)
});

// SeaHash
benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: i32| -> u64 {
seahash::hash_seeded(data, seed as u64, 0, 0, 0)
benchmark(&mut processor, slice, "seahash", |data: &[u8], seed: u64| -> u64 {
seahash::hash_seeded(data, seed, 0, 0, 0)
});

// MetroHash
Expand All @@ -70,8 +70,8 @@ fn main() {
});

// FNV-1a
benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: i32| -> u64 {
let mut fnv_hasher = fnv::FnvHasher::with_key(seed as u64);
benchmark(&mut processor, slice, "fnv-1a", |data: &[u8], seed: u64| -> u64 {
let mut fnv_hasher = fnv::FnvHasher::with_key(seed);
fnv_hasher.write(data);
fnv_hasher.finish()
});
Expand All @@ -80,8 +80,8 @@ fn main() {
unsafe { dealloc(ptr, layout) };
}

fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
where F: Fn(&[u8], i32) -> u64
fn benchmark<F, S>(processor: &mut ResultProcessor, data: &[u8], name: &str, delegate: F)
where F: Fn(&[u8], S) -> u64, S: Default + TryFrom<u128> + TryInto<usize>
{
processor.on_start(name);
for i in 2.. {
Expand All @@ -91,21 +91,23 @@ fn benchmark<F>(processor: &mut ResultProcessor, data: &[u8], name: &str, delega
}

// Warmup
black_box(time(ITERATIONS, &|| delegate(&data[..len], 0)));
black_box(time(ITERATIONS, &|| delegate(&data[..len], S::default())));

let mut total_duration: Duration = Duration::ZERO;
let mut runs: usize = 0;
let now = Instant::now();
while now.elapsed() < MAX_RUN_DURATION {
// Prevent optimizations from predictable seed
let seed = total_duration.as_nanos() as i32;
// Prevent optimizations from predictable slice
// Also makes the benchmark use both aligned on unaligned data
let start = seed as usize & 0xFF;
// Make seed unpredictable to prevent optimizations
let seed = S::try_from(total_duration.as_nanos())
.unwrap_or_else(|_| panic!("Something went horribly wrong!"));
// Offset slice by an unpredictable amount to prevent optimization (pre caching)
// and make the benchmark use both aligned and unaligned data
let start = S::try_into(seed)
.unwrap_or_else(|_| panic!("Something went horribly wrong!")) & 0xFF;
let end = start + len;
let slice = &data[start..end];
// Execute method for a new iterations
total_duration += time(ITERATIONS, &|| delegate(slice, seed));
total_duration += time(ITERATIONS, &|| delegate(slice, S::default()));
runs += 1;
}
let throughput = (len as f64) / (1024f64 * 1024f64 * (total_duration.as_secs_f64() / runs as f64 / ITERATIONS as f64));
Expand Down
85 changes: 46 additions & 39 deletions src/gxhash/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
pub(crate) mod platform;

use std::intrinsics::likely;

use platform::*;

/// Hashes an arbitrary stream of bytes to an u32.
Expand All @@ -12,7 +14,7 @@ use platform::*;
/// println!("Hash is {:x}!", gxhash::gxhash32(&bytes, seed));
/// ```
#[inline(always)]
pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
pub fn gxhash32(input: &[u8], seed: i64) -> u32 {
unsafe {
let p = &gxhash(input, create_seed(seed)) as *const State as *const u32;
*p
Expand All @@ -29,7 +31,7 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
/// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed));
/// ```
#[inline(always)]
pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
pub fn gxhash64(input: &[u8], seed: i64) -> u64 {
unsafe {
let p = &gxhash(input, create_seed(seed)) as *const State as *const u64;
*p
Expand All @@ -46,7 +48,7 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
/// println!("Hash is {:x}!", gxhash::gxhash128(&bytes, seed));
/// ```
#[inline(always)]
pub fn gxhash128(input: &[u8], seed: i32) -> u128 {
pub fn gxhash128(input: &[u8], seed: i64) -> u128 {
unsafe {
let p = &gxhash(input, create_seed(seed)) as *const State as *const u128;
*p
Expand All @@ -58,18 +60,12 @@ macro_rules! load_unaligned {
$(
#[allow(unused_mut)]
let mut $var = load_unaligned($ptr);
#[allow(unused_assignments)]
$ptr = ($ptr).offset(1);
)+
};
}

const RANGE_1_BEGIN: usize = VECTOR_SIZE + 1;
const RANGE_1_END: usize = VECTOR_SIZE * 2;
const RANGE_2_BEGIN: usize = RANGE_1_BEGIN + 1;
const RANGE_2_END: usize = VECTOR_SIZE * 3;
const RANGE_3_BEGIN: usize = RANGE_2_BEGIN + 1;
const RANGE_3_END: usize = VECTOR_SIZE * 4;

#[inline(always)]
pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
finalize(compress_all(input), seed)
Expand All @@ -78,40 +74,51 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
#[inline(always)]
unsafe fn compress_all(input: &[u8]) -> State {

let len: usize = input.len();
let len = input.len();
let mut ptr = input.as_ptr() as *const State;

let (mut hash_vector, remaining_bytes, p) = match len {
// Fast path with no compression for payloads that fit in a single state
0..=VECTOR_SIZE => {
(get_partial(ptr, len), 0, ptr)
},
RANGE_1_BEGIN..=RANGE_1_END => {
load_unaligned!(ptr, v1);
(v1, len - VECTOR_SIZE, ptr)
},
RANGE_2_BEGIN..=RANGE_2_END => {
load_unaligned!(ptr, v1, v2);
(compress(v1, v2), len - VECTOR_SIZE * 2, ptr)
},
RANGE_3_BEGIN..=RANGE_3_END => {
load_unaligned!(ptr, v1, v2, v3);
(compress(compress(v1, v2), v3), len - VECTOR_SIZE * 3, ptr)
},
_ => {
compress_many(ptr, create_empty(), len)
}
};
if likely(len <= VECTOR_SIZE) {
// Input fits on a single SIMD vector, however we might read beyond the input message
// Thus we need this safe method that checks if it can safely read beyond or must copy
return get_partial(ptr, len);
}

let remaining_bytes = len % VECTOR_SIZE;

if remaining_bytes > 0 {
hash_vector = compress(hash_vector, get_partial(p, remaining_bytes))
// The input does not fit on a single SIMD vector
let hash_vector: State;
if remaining_bytes == 0 {
load_unaligned!(ptr, v0);
hash_vector = v0;
} else {
// If the input length does not match the length of a whole number of SIMD vectors,
// it means we'll need to read a partial vector. We can start with the partial vector first,
// so that we can safely read beyond since we expect the following bytes to still be part of
// the input
hash_vector = get_partial_unsafe(ptr,remaining_bytes as usize);
ptr = ptr.byte_add(remaining_bytes);
}

hash_vector
if len <= VECTOR_SIZE * 2 {
// Fast path when input length > 16 and <= 32
load_unaligned!(ptr, v0);
compress(hash_vector, v0)
} else if len <= VECTOR_SIZE * 3 {
// Fast path when input length > 32 and <= 48
load_unaligned!(ptr, v0, v1);
compress(hash_vector, compress(v0, v1))
} else if len <= VECTOR_SIZE * 4 {
// Fast path when input length > 48 and <= 64
load_unaligned!(ptr, v0, v1, v2);
compress(hash_vector, compress(compress(v0, v1), v2))
} else {
// Input message is large and we can use the high ILP loop
compress_many(ptr, hash_vector, len)
}
}

#[inline(always)]
unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> (State, usize, *const State) {
unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State {

const UNROLL_FACTOR: usize = 8;

Expand Down Expand Up @@ -143,8 +150,7 @@ unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_byt
hash_vector = compress(hash_vector, v0);
}

let remaining_bytes: usize = remaining_bytes & (VECTOR_SIZE - 1);
(hash_vector, remaining_bytes, ptr)
hash_vector
}

#[cfg(test)]
Expand Down Expand Up @@ -269,6 +275,7 @@ mod tests {
fn is_stable() {
assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
assert_eq!(3128839713, gxhash32(&[42u8; 1000], 1234));
assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0));
assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42));
}
}
29 changes: 16 additions & 13 deletions src/gxhash/platform/arm_128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ pub unsafe fn create_empty() -> State {
}

#[inline(always)]
pub unsafe fn create_seed(seed: i32) -> State {
vreinterpretq_s8_s32(vdupq_n_s32(seed))
pub unsafe fn create_seed(seed: i64) -> State {
vreinterpretq_s8_s64(vdupq_n_s64(seed))
}

#[inline(always)]
Expand All @@ -31,27 +31,30 @@ pub unsafe fn load_unaligned(p: *const State) -> State {

#[inline(always)]
pub unsafe fn get_partial(p: *const State, len: usize) -> State {
let partial_vector: State;
if likely(check_same_page(p)) {
// Unsafe (hence the check) but much faster
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8);
get_partial_unsafe(p, len)
} else {
partial_vector = get_partial_safe(p as *const i8, len as usize);
get_partial_safe(p, len)
}
// Prevents padded zeroes to introduce bias
return vaddq_s8(partial_vector, vdupq_n_s8(len as i8));
}

#[inline(never)]
unsafe fn get_partial_safe(data: *const i8, len: usize) -> State {
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
let mut buffer = [0i8; VECTOR_SIZE];
// Copy data into the buffer
std::ptr::copy(data, buffer.as_mut_ptr(), len);
std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
// Load the buffer into a __m256i vector
vld1q_s8(buffer.as_ptr())
let partial_vector = vld1q_s8(buffer.as_ptr());
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
let partial_vector = vandq_s8(load_unaligned(data), ReinterpretUnion { uint8: mask }.int8);
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
Expand Down
Loading

0 comments on commit 8bccbe4

Please sign in to comment.