From db63372d27f832fba30674b0033fea0ba01a2f57 Mon Sep 17 00:00:00 2001
From: Olivier Giniaux <oginiaux@gmail.com>
Date: Sun, 29 Oct 2023 02:29:28 +0200
Subject: [PATCH] Fix quality for 256-bit state version

---
 .github/workflows/rust.yml     |  4 +++-
 Cargo.toml                     |  2 +-
 src/gxhash/mod.rs              | 28 ----------------------------
 src/gxhash/platform/x86_256.rs |  3 ++-
 4 files changed, 6 insertions(+), 31 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 2bcc45a..88031f8 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -22,4 +22,6 @@ jobs:
     - name: Build
       run: cargo build --verbose
     - name: Run tests
-      run: cargo test --verbose
\ No newline at end of file
+      run: cargo test --verbose
+    - name: Run tests (AVX2)
+      run: cargo test --verbose --features avx2
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index ad59ed9..29d512b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,7 @@ version = "0.1.0"
 edition = "2021"
 
 [features]
-# The 256-bit state GxHash is faster for large inputs than the default 128-bit state implementation.
+# The 256-bit state GxHash is faster for large inputs than the default 128-bit state implementation, but slower for small inputs.
 # Please not however that the 256-bit GxHash and the 128-bit GxHash don't generate the same hashes for a same input.
 # Requires AVX2 and VAES (X86).
 avx2 = []
diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs
index 5723c3d..4e79def 100644
--- a/src/gxhash/mod.rs
+++ b/src/gxhash/mod.rs
@@ -1,7 +1,6 @@
 mod platform;
 use platform::*;
 
-#[cfg(not(feature = "avx2"))]
 #[inline(always)]
 pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
     unsafe {
@@ -10,20 +9,6 @@ pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
     }
 }
 
-// Since the 256-bit runs AES operations on two 128-bit lanes, we need to extract
-// the hash from the center, picking the same entropy amount from the two lanes
-#[cfg(feature = "avx2")]
-#[inline(always)]
-pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
-    unsafe {
-        let p = &gxhash(input, seed) as *const state as *const u8;
-        let offset = std::mem::size_of::<state>() / 2 - std::mem::size_of::<u32>() / 2 - 1;
-        let shifted_ptr = p.offset(offset as isize) as *const u32;
-        *shifted_ptr
-    }
-}
-
-#[cfg(not(feature = "avx2"))]
 #[inline(always)]
 pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
     unsafe {
@@ -32,19 +17,6 @@ pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
     }
 }
 
-// Since the 256-bit runs AES operations on two 128-bit lanes, we need to extract
-// the hash from the center, picking the same entropy amount from the two lanes
-#[cfg(feature = "avx2")]
-#[inline(always)]
-pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
-    unsafe {
-        let p = &gxhash(input, seed) as *const state as *const u8;
-        let offset = std::mem::size_of::<state>() / 2 - std::mem::size_of::<u64>() / 2 - 1;
-        let shifted_ptr = p.offset(offset as isize) as *const u64;
-        *shifted_ptr
-    }
-}
-
 const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize;
 
 #[inline(always)]
diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs
index b3e855f..c037fe1 100644
--- a/src/gxhash/platform/x86_256.rs
+++ b/src/gxhash/platform/x86_256.rs
@@ -87,5 +87,6 @@ pub unsafe fn finalize(hash: state, seed: i32) -> state {
     hash = _mm256_aesenc_epi128(hash, keys_2);
     hash = _mm256_aesenclast_epi128(hash, keys_3);
 
-    hash
+    let permuted = _mm256_permute2x128_si256(hash, hash, 0x21);
+    _mm256_xor_si256(hash, permuted)
 }
\ No newline at end of file