Cleanup
ogxd committed Nov 10, 2024
1 parent 259e8a4 commit 1ac3e66
Showing 5 changed files with 10 additions and 91 deletions.
8 changes: 0 additions & 8 deletions .github/workflows/bench.yml
@@ -1,8 +1,6 @@
name: Benchmark

on:
pull_request:
branches: [ "main" ]
workflow_dispatch:

env:
@@ -19,9 +17,6 @@ jobs:
- name: Update rust
run: rustup update

- name: Switch to nightly rust
run: rustup default nightly

- name: Benchmark
run: cargo bench --bench throughput --features bench-plot

@@ -61,9 +56,6 @@ jobs:
- name: Update rust
run: rustup update

- name: Switch to nightly rust
run: rustup default nightly

- name: Benchmark
run: cargo bench --bench throughput --features bench-plot

10 changes: 5 additions & 5 deletions README.md
@@ -97,7 +97,9 @@ The `hybrid` feature flag enables a hybrid implementation of GxHash. This is dis
## Benchmarks

[![Benchmark](https://github.com/ogxd/gxhash/actions/workflows/bench.yml/badge.svg)](https://github.com/ogxd/gxhash/actions/workflows/bench.yml)
GxHash is continuously benchmarked on X86 and ARM GitHub runners.

Important: If performance is a critical feature for your application, don't forget to benchmark the cost of hashing in your own context. Numbers shared here may be radically different in your environment and with your hardware.

To run the benchmarks locally, use one of the following:
```bash
@@ -109,16 +111,14 @@ cargo bench --bench throughput
cargo bench --bench hashset
```

Note: The `throughput` benchmark does not rely on criterion for timing measurements. In an attempt to reduce bias in this microbenchmark as much as possible, it shuffles seeds, input data, and alignment. It also has the benefit of being less of a "black box" compared to criterion. There is however a criterion-based throughput benchmark named `throughput_criterion` if you prefer. Results vary slightly between the two benchmarks; don't hesitate to submit an issue if you suspect bias and want to suggest improvements.

Most importantly: if performance is a critical feature for your application, don't forget to benchmark the cost of hashing in your own context. Numbers shared here may be radically different in your environment and with your hardware.

### Throughput

Throughput is measured as the number of bytes hashed per second.

*Some prefer to talk about **latency** (the time for generating a hash) or **hashrate** (the number of hashes generated per second) when measuring hash function performance, but these are all equivalent in the end: they all boil down to measuring the time it takes to hash some input and then applying a different scalar transformation. For instance, if the latency for a `4 bytes` hash is `1 ms`, then the throughput is `1 / 0.001 * 4 = 4000 bytes per second`. Throughput allows us to conveniently compare the performance of a hash function for any input size on a single graph.*
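
As a rough illustration of that conversion, the sketch below times an arbitrary `hash` closure and divides the input size by the measured latency; the closure, input, and iteration count are placeholders rather than anything taken from this repository's benchmarks:

```rust
use std::hint::black_box;
use std::time::Instant;

/// Turns a measured latency into a throughput figure (bytes per second).
/// `hash` stands in for whatever hash function is being benchmarked.
fn throughput(hash: impl Fn(&[u8]) -> u64, input: &[u8], iterations: u32) -> f64 {
    let start = Instant::now();
    for _ in 0..iterations {
        black_box(hash(black_box(input)));
    }
    let latency = start.elapsed().as_secs_f64() / iterations as f64;
    // e.g. 4 bytes hashed in 1 ms -> 1 / 0.001 * 4 = 4000 bytes per second
    input.len() as f64 / latency
}
```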

The `throughput` benchmark is custom (it does not rely on criterion.rs). In an attempt to reduce bias in this microbenchmark as much as possible, it shuffles seeds, input data, and alignment. It also has the benefit of being less of a "black box" compared to criterion. There is however a criterion-based throughput benchmark named `throughput_criterion` if you prefer. Results vary slightly between the two benchmarks; don't hesitate to submit an issue if you suspect bias and want to suggest improvements.

**Latest Benchmark Results:**
![aarch64](./benches/throughput/aarch64.svg)
![x86_64](./benches/throughput/x86_64.svg)
27 changes: 2 additions & 25 deletions src/gxhash/platform/arm.rs
@@ -25,6 +25,7 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
vld1q_s8(p as *const i8)
}

// Rarely called, it's worth not inlining it to reduce code size
#[inline(never)]
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
@@ -34,25 +35,10 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Load the buffer into a __m256i vector
let partial_vector = vld1q_s8(buffer.as_ptr());
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))

//let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
//let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices));

// Using simd_masked_load
// State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(vdupq_n_s8(len as i8))))
// std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8))

// Using std::simd
// use std::simd::*;
// use std::mem::transmute;
// let slice = std::slice::from_raw_parts(data as *const i8, len);
// let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
// let vector: State = transmute(data);
// return vector;
}

#[inline(always)]
pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State {
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
use std::arch::asm;
@@ -62,19 +48,10 @@ pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State
src = in(reg) data, out("v2") result,
options(nomem, nostack)
);
//let result = load_unaligned(data);
let partial_vector = vandq_s8(result, vreinterpretq_s8_u8(mask));
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
let partial_vector = vandq_s8(load_unaligned(data), vreinterpretq_s8_u8(mask));
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
// See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
pub unsafe fn aes_encrypt(data: State, keys: State) -> State {
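
The emulation described in that blog post maps x86's `AESENC` round (ShiftRows, SubBytes, MixColumns, then XOR with the round key) onto ARM's `vaeseq_u8`/`vaesmcq_u8` pair. A minimal sketch of the idea, assuming plain `uint8x16_t` operands rather than this crate's `State` type:

```rust
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

// x86 AESENC computes MixColumns(SubBytes(ShiftRows(state))) XOR round_key.
// ARM's AESE instruction performs AddRoundKey + ShiftRows + SubBytes, so feeding
// it an all-zero key isolates ShiftRows + SubBytes; AESMC then applies MixColumns.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "aes")]
unsafe fn aesenc_emulated(state: uint8x16_t, round_key: uint8x16_t) -> uint8x16_t {
    let sub_shift = vaeseq_u8(state, vdupq_n_u8(0)); // ShiftRows + SubBytes
    let mixed = vaesmcq_u8(sub_shift);               // MixColumns
    veorq_u8(mixed, round_key)                       // final AddRoundKey
}
```
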
4 changes: 1 addition & 3 deletions src/gxhash/platform/mod.rs
@@ -18,12 +18,10 @@ const PAGE_SIZE: usize = 0x1000;
pub unsafe fn get_partial(p: *const State, len: usize) -> State {
// Safety check
if check_same_page(p) {
get_partial_unsafe_no_ub(p, len)
get_partial_unsafe(p, len)
} else {
get_partial_safe(p, len)
}

//get_partial_safe(p, len)
}
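
For context, the `check_same_page` guard used by `get_partial` above can be as simple as an address computation; a sketch, assuming the `PAGE_SIZE` constant from this module and a 16-byte vector size:

```rust
const PAGE_SIZE: usize = 0x1000; // 4 KiB pages, as defined above
const VECTOR_SIZE: usize = 16;   // assumed size of `State` in bytes

// True when a full 16-byte load starting at `p` stays inside the page that
// contains `p`, so reading a few bytes past `len` cannot touch an unmapped page.
#[inline(always)]
fn check_same_page<T>(p: *const T) -> bool {
    let offset_in_page = p as usize & (PAGE_SIZE - 1);
    offset_in_page <= PAGE_SIZE - VECTOR_SIZE
}
```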

#[inline(always)]
52 changes: 2 additions & 50 deletions src/gxhash/platform/x86.rs
@@ -28,42 +28,18 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
_mm_loadu_si128(p)
}

// Rarely called, it's worth not inlining it to reduce code size
#[inline(never)]
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
let mut buffer = [0i8; VECTOR_SIZE];
core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))

// Using URBD
//get_partial_unsafe(data, len)

// Using simd_masked_load
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8))))

// Using std::simd
// use std::simd::*;
// use std::mem::transmute;
// let slice = std::slice::from_raw_parts(data as *const i8, len);
// let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
// let vector: State = transmute(data);
// return vector;

// Using inline assembly to load out-of-bounds
// use std::arch::asm;
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// let mut result: State;
// asm!("movdqu [{}], {}", in(reg) data, out(xmm_reg) result, options(pure, nomem, nostack));
// let partial_vector = _mm_and_si128(result, mask);
// _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State {
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
// Using inline assembly to load out-of-bounds
use std::arch::asm;
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -72,30 +48,6 @@ pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State
asm!("movdqu {0}, [{1}]", out(xmm_reg) result, in(reg) data, options(pure, nomem, nostack));
let partial_vector = _mm_and_si128(result, mask);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))

// Using simd_masked_load
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8))))

// Using std::simd
// use std::simd::*;
// use std::mem::transmute;
// let slice = std::slice::from_raw_parts(data as *const i8, len);
// let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
// let vector: State = transmute(data);
// return vector;

//return get_partial_safe(data, len);
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
let d: __m128i = _mm_loadu_si128(data);
let partial_vector = _mm_and_si128(d, mask);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}

#[inline(always)]
