Cleanup
ogxd committed Nov 10, 2024
1 parent 259e8a4 commit 1ac3e66
Showing 5 changed files with 10 additions and 91 deletions.
8 changes: 0 additions & 8 deletions .github/workflows/bench.yml
@@ -1,8 +1,6 @@
name: Benchmark

on:
pull_request:
branches: [ "main" ]
workflow_dispatch:

env:
@@ -19,9 +17,6 @@ jobs:
- name: Update rust
run: rustup update

- name: Switch to nightly rust
run: rustup default nightly

- name: Benchmark
run: cargo bench --bench throughput --features bench-plot

@@ -61,9 +56,6 @@ jobs:
- name: Update rust
run: rustup update

- name: Switch to nightly rust
run: rustup default nightly

- name: Benchmark
run: cargo bench --bench throughput --features bench-plot

10 changes: 5 additions & 5 deletions README.md
@@ -97,7 +97,9 @@ The `hybrid` feature flag enables a hybrid implementation of GxHash. This is dis
## Benchmarks

[![Benchmark](https://github.com/ogxd/gxhash/actions/workflows/bench.yml/badge.svg)](https://github.com/ogxd/gxhash/actions/workflows/bench.yml)
GxHash is continuously benchmarked on X86 and ARM GitHub runners.

Important: If performance is a critical feature for your application, don't forget to benchmark the cost of hashing in your own context. Numbers shared here may be radically different in your environment and with your hardware.

To run the benchmarks locally, use one of the following:
```bash
@@ -109,16 +111,14 @@ cargo bench --bench throughput
cargo bench --bench hashset
```

Note: The `throughput` benchmark does not rely on criterion for timing measurements. In an attempt to reduce bias in this microbenchmark as much as possible, it shuffles seeds, input data, and alignment. It also has the benefit of being less of a "black box" compared to criterion. There is however a criterion-based throughput benchmark named `throughput_criterion` if you prefer. Results vary slightly between the two benchmarks; don't hesitate to submit an issue if you suspect bias and want to suggest improvements.

Most importantly: if performance is a critical feature for your application, don't forget to benchmark the cost of hashing in your own context. Numbers shared here may be radically different in your environment and with your hardware.

### Throughput

Throughput is measured as the number of bytes hashed per second.

*Some prefer to talk about **latency** (the time for generating a hash) or **hashrate** (the number of hashes generated per second) when measuring hash function performance, but these are all equivalent in the end: they all boil down to measuring the time it takes to hash some input and then applying a different scalar transformation. For instance, if the latency for a `4 bytes` hash is `1 ms`, then the throughput is `1 / 0.001 * 4 = 4000 bytes per second`. Throughput allows us to conveniently compare the performance of a hash function for any input size on a single graph.*
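
As a rough illustration of that conversion, the sketch below times an arbitrary `hash` closure and divides the input size by the measured latency; the closure, input, and iteration count are placeholders rather than anything taken from this repository's benchmarks:

```rust
use std::hint::black_box;
use std::time::Instant;

/// Turns a measured latency into a throughput figure (bytes per second).
/// `hash` stands in for whatever hash function is being benchmarked.
fn throughput(hash: impl Fn(&[u8]) -> u64, input: &[u8], iterations: u32) -> f64 {
    let start = Instant::now();
    for _ in 0..iterations {
        black_box(hash(black_box(input)));
    }
    let latency = start.elapsed().as_secs_f64() / iterations as f64;
    // e.g. 4 bytes hashed in 1 ms -> 1 / 0.001 * 4 = 4000 bytes per second
    input.len() as f64 / latency
}
```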

The `throughput` benchmark is custom (it does not rely on criterion.rs). In an attempt to reduce bias in this microbenchmark as much as possible, it shuffles seeds, input data, and alignment. It also has the benefit of being less of a "black box" compared to criterion. There is however a criterion-based throughput benchmark named `throughput_criterion` if you prefer. Results vary slightly between the two benchmarks; don't hesitate to submit an issue if you suspect bias and want to suggest improvements.

**Latest Benchmark Results:**
![aarch64](./benches/throughput/aarch64.svg)
![x86_64](./benches/throughput/x86_64.svg)
27 changes: 2 additions & 25 deletions src/gxhash/platform/arm.rs
@@ -25,6 +25,7 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
vld1q_s8(p as *const i8)
}

// Rarely called, it's worth not inlining it to reduce code size
#[inline(never)]
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
@@ -34,25 +35,10 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Load the buffer into a __m256i vector
let partial_vector = vld1q_s8(buffer.as_ptr());
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))

//let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
//let mask = vreinterpretq_s8_u8(vcgtq_s8(vdupq_n_s8(len as i8), indices));

// Using simd_masked_load
// State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(vdupq_n_s8(len as i8))))
// std::intrinsics::simd::simd_masked_load(mask, data as *const i8, vdupq_n_s8(len as i8))

// Using std::simd
// use std::simd::*;
// use std::mem::transmute;
// let slice = std::slice::from_raw_parts(data as *const i8, len);
// let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
// let vector: State = transmute(data);
// return vector;
}

#[inline(always)]
pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State {
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
use std::arch::asm;
@@ -62,19 +48,10 @@ pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State
src = in(reg) data, out("v2") result,
options(nomem, nostack)
);
//let result = load_unaligned(data);
let partial_vector = vandq_s8(result, vreinterpretq_s8_u8(mask));
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
let partial_vector = vandq_s8(load_unaligned(data), vreinterpretq_s8_u8(mask));
vaddq_s8(partial_vector, vdupq_n_s8(len as i8))
}

#[inline(always)]
// See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
pub unsafe fn aes_encrypt(data: State, keys: State) -> State {
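
The emulation described in that blog post maps x86's `AESENC` round (ShiftRows, SubBytes, MixColumns, then XOR with the round key) onto ARM's `vaeseq_u8`/`vaesmcq_u8` pair. A minimal sketch of the idea, assuming plain `uint8x16_t` operands rather than this crate's `State` type:

```rust
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

// x86 AESENC computes MixColumns(SubBytes(ShiftRows(state))) XOR round_key.
// ARM's AESE instruction performs AddRoundKey + ShiftRows + SubBytes, so feeding
// it an all-zero key isolates ShiftRows + SubBytes; AESMC then applies MixColumns.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "aes")]
unsafe fn aesenc_emulated(state: uint8x16_t, round_key: uint8x16_t) -> uint8x16_t {
    let sub_shift = vaeseq_u8(state, vdupq_n_u8(0)); // ShiftRows + SubBytes
    let mixed = vaesmcq_u8(sub_shift);               // MixColumns
    veorq_u8(mixed, round_key)                       // final AddRoundKey
}
```
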
4 changes: 1 addition & 3 deletions src/gxhash/platform/mod.rs
@@ -18,12 +18,10 @@ const PAGE_SIZE: usize = 0x1000;
pub unsafe fn get_partial(p: *const State, len: usize) -> State {
// Safety check
if check_same_page(p) {
get_partial_unsafe_no_ub(p, len)
get_partial_unsafe(p, len)
} else {
get_partial_safe(p, len)
}

//get_partial_safe(p, len)
}
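
For context, the `check_same_page` guard used by `get_partial` above can be as simple as an address computation; a sketch, assuming the `PAGE_SIZE` constant from this module and a 16-byte vector size:

```rust
const PAGE_SIZE: usize = 0x1000; // 4 KiB pages, as defined above
const VECTOR_SIZE: usize = 16;   // assumed size of `State` in bytes

// True when a full 16-byte load starting at `p` stays inside the page that
// contains `p`, so reading a few bytes past `len` cannot touch an unmapped page.
#[inline(always)]
fn check_same_page<T>(p: *const T) -> bool {
    let offset_in_page = p as usize & (PAGE_SIZE - 1);
    offset_in_page <= PAGE_SIZE - VECTOR_SIZE
}
```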

#[inline(always)]
52 changes: 2 additions & 50 deletions src/gxhash/platform/x86.rs
@@ -28,42 +28,18 @@ pub unsafe fn load_unaligned(p: *const State) -> State {
_mm_loadu_si128(p)
}

// Rarely called, it's worth not inlining it to reduce code size
#[inline(never)]
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
let mut buffer = [0i8; VECTOR_SIZE];
core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))

// Using URBD
//get_partial_unsafe(data, len)

// Using simd_masked_load
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8))))

// Using std::simd
// use std::simd::*;
// use std::mem::transmute;
// let slice = std::slice::from_raw_parts(data as *const i8, len);
// let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
// let vector: State = transmute(data);
// return vector;

// Using inline assembly to load out-of-bounds
// use std::arch::asm;
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// let mut result: State;
// asm!("movdqu [{}], {}", in(reg) data, out(xmm_reg) result, options(pure, nomem, nostack));
// let partial_vector = _mm_and_si128(result, mask);
// _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State {
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
// Using inline assembly to load out-of-bounds
use std::arch::asm;
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
@@ -72,30 +48,6 @@ pub unsafe fn get_partial_unsafe_no_ub(data: *const State, len: usize) -> State
asm!("movdqu {0}, [{1}]", out(xmm_reg) result, in(reg) data, options(pure, nomem, nostack));
let partial_vector = _mm_and_si128(result, mask);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))

// Using simd_masked_load
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// State::from(std::intrinsics::simd::simd_masked_load(core::simd::i8x16::from(mask), data as *const i8, core::simd::i8x16::from(_mm_set1_epi8(len as i8))))

// Using std::simd
// use std::simd::*;
// use std::mem::transmute;
// let slice = std::slice::from_raw_parts(data as *const i8, len);
// let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
// let vector: State = transmute(data);
// return vector;

//return get_partial_safe(data, len);
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
let d: __m128i = _mm_loadu_si128(data);
let partial_vector = _mm_and_si128(d, mask);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}

#[inline(always)]
