From 2098bd41680f8ee90bb43e443bb7917b498995b4 Mon Sep 17 00:00:00 2001 From: Matthieu M Date: Sun, 24 Mar 2024 17:47:34 +0100 Subject: [PATCH] WIP Switch to a full bitwidth h2 * Changes: - Use all values of h2, not just 130 of them. - Convert SSE2 implementation for benchmarking. * Motivation: Using 256 values instead of 130 could theoretically lower the number of false-positive residual matches by close to 50%. On the other hand, it does make h2 slightly more complicated to compute, and possibly to operate on. --- src/raw/mod.rs | 217 +++++++++++++++++++++++++++++++++++++++++++++--- src/raw/sse2.rs | 52 ++++++++---- 2 files changed, 241 insertions(+), 28 deletions(-) diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 22c01f5e9..4fe50ac01 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -105,28 +105,29 @@ trait SizedTypeProperties: Sized { impl<T> SizedTypeProperties for T {} /// Control byte value for an empty bucket. -const EMPTY: u8 = 0b1111_1111; +const EMPTY: u8 = 0b0111_1111; /// Control byte value for a deleted bucket. -const DELETED: u8 = 0b1000_0000; +const DELETED: u8 = 0b0111_1110; /// Checks whether a control byte represents a full bucket (top bit is clear). #[inline] fn is_full(ctrl: u8) -> bool { - ctrl & 0x80 == 0 + (ctrl as i8) < (DELETED as i8) } /// Checks whether a control byte represents a special value (top bit is set). #[inline] fn is_special(ctrl: u8) -> bool { - ctrl & 0x80 != 0 + (ctrl as i8) >= (DELETED as i8) } /// Checks whether a special control value is EMPTY (just check 1 bit). #[inline] fn special_is_empty(ctrl: u8) -> bool { debug_assert!(is_special(ctrl)); - ctrl & 0x01 != 0 + + ctrl == EMPTY } /// Primary hash function, used to select the initial bucket to probe from. @@ -137,23 +138,46 @@ fn h1(hash: u64) -> usize { hash as usize } -// Constant for h2 function that grabing the top 7 bits of the hash. +// Constant for the h2 function, which grabs the top 8 bits of the hash. 
const MIN_HASH_LEN: usize = if mem::size_of::<usize>() < mem::size_of::<u64>() { mem::size_of::<usize>() } else { mem::size_of::<u64>() }; -/// Secondary hash function, saved in the low 7 bits of the control byte. +/// Secondary hash function, saved in the control byte. #[inline] #[allow(clippy::cast_possible_truncation)] fn h2(hash: u64) -> u8 { - // Grab the top 7 bits of the hash. While the hash is normally a full 64-bit + const fn compute_control() -> [u8; 256] { + let mut result = [0; 256]; + + let mut i = 0; + + while i < 256 { + result[i] = i as u8; + + i += 1; + } + + // Avoid overlap with special values. + result[EMPTY as usize] += 8; + result[DELETED as usize] += 8; + + result + } + + #[rustfmt::skip] + const CONTROL: [u8; 256] = compute_control(); + + // Grab the top 8 bits of the hash. While the hash is normally a full 64-bit // value, some hash functions (such as FxHash) produce a usize result // instead, which means that the top 32 bits are 0 on 32-bit platforms. // So we use MIN_HASH_LEN constant to handle this. - let top7 = hash >> (MIN_HASH_LEN * 8 - 7); - (top7 & 0x7f) as u8 // truncation + let top8 = hash >> (MIN_HASH_LEN * 8 - 8); + // Mask to 8 bits: a 64-bit hash on a 32-bit target still has high bits set here. + // Look up the control byte, remapped so it never equals EMPTY or DELETED. + CONTROL[(top8 & 0xff) as usize] } /// Probe sequence based on triangular numbers, which is guaranteed (since our @@ -4562,6 +4586,179 @@ impl RawExtractIf<'_, T, A> { } } +#[cfg(test)] +mod test_group { + use super::*; + + type RawGroup = [u8; Group::WIDTH]; + + fn load(raw: RawGroup) -> Group { + // Safety: + // - `raw.len() == Group::WIDTH`. + unsafe { Group::load(raw.as_ptr()) } + } + + fn store(group: Group) -> RawGroup { + #[repr(align(16))] + struct Aligned(RawGroup); + + let mut result = Aligned(RawGroup::default()); + + // Safety: + // - `raw.len() == Group::WIDTH`. + // - `raw` is suitably aligned. 
+ unsafe { group.store_aligned(result.0.as_mut_ptr()) } + + result.0 + } + + #[test] + fn test_match_byte() { + use ::alloc::vec::Vec; + + let mut raw = RawGroup::default(); + + for (i, slot) in raw.iter_mut().enumerate() { + if i % 2 == 0 { + *slot = EMPTY; + } else { + *slot = 0x44; + } + } + + let group = load(raw); + + let is_match = group.match_byte(0x44); + + let matched: Vec<_> = is_match.into_iter().collect(); + + assert_eq!(Group::WIDTH / 2, matched.len(), "{matched:?}"); + assert!(matched.iter().all(|i| *i % 2 != 0), "{matched:?}"); + } + + #[test] + fn test_match_empty() { + use ::alloc::vec::Vec; + + let mut raw = RawGroup::default(); + + for (i, slot) in raw.iter_mut().enumerate() { + if i % 2 == 0 { + *slot = EMPTY; + } else { + *slot = DELETED; + } + } + + let group = load(raw); + + let is_empty = group.match_empty(); + + let empty: Vec<_> = is_empty.into_iter().collect(); + + assert_eq!(Group::WIDTH / 2, empty.len(), "{empty:?}"); + assert!(empty.iter().all(|i| *i % 2 == 0), "{empty:?}"); + } + + #[test] + fn test_match_empty_or_deleted() { + use ::alloc::vec::Vec; + + let mut raw = RawGroup::default(); + + for (i, slot) in raw.iter_mut().enumerate() { + let value = match i % 4 { + 0 => EMPTY, + 1 => 2, + 2 => DELETED, + 3 => 255, + _ => unreachable!("i % 4 < 4"), + }; + + *slot = value; + } + + let group = load(raw); + + let is_empty_or_deleted = group.match_empty_or_deleted(); + + let empty_or_deleted: Vec<_> = is_empty_or_deleted.into_iter().collect(); + + assert_eq!( + Group::WIDTH / 2, + empty_or_deleted.len(), + "{empty_or_deleted:?}" + ); + assert!( + empty_or_deleted.iter().all(|i| *i % 2 == 0), + "{empty_or_deleted:?}" + ); + } + + #[test] + fn test_match_full() { + use ::alloc::vec::Vec; + + let mut raw = RawGroup::default(); + + for (i, slot) in raw.iter_mut().enumerate() { + let value = match i % 4 { + 0 => EMPTY, + 1 => 2, + 2 => DELETED, + 3 => 255, + _ => unreachable!("i % 4 < 4"), + }; + + *slot = value; + } + + let group = 
load(raw); + + let is_full = group.match_full(); + + let full: Vec<_> = is_full.into_iter().collect(); + + assert_eq!(Group::WIDTH / 2, full.len(), "{full:?}"); + assert!(full.iter().all(|i| *i % 2 != 0), "{full:?}"); + } + + #[test] + fn test_convert_special_to_empty_and_full_to_deleted() { + use ::alloc::vec::Vec; + + let mut raw = RawGroup::default(); + + for (i, slot) in raw.iter_mut().enumerate() { + let value = match i % 4 { + 0 => EMPTY, + 1 => 2, + 2 => DELETED, + 3 => 255, + _ => unreachable!("i % 4 < 4"), + }; + + *slot = value; + } + + let group = load(raw); + + let converted = group.convert_special_to_empty_and_full_to_deleted(); + + dbg!(store(converted)); + + let empty: Vec<_> = converted.match_empty().into_iter().collect(); + + assert_eq!(Group::WIDTH / 2, empty.len(), "{empty:?}"); + assert!(empty.iter().all(|i| *i % 2 == 0), "{empty:?}"); + + let deleted: Vec<_> = converted.match_byte(DELETED).into_iter().collect(); + + assert_eq!(Group::WIDTH / 2, deleted.len(), "{deleted:?}"); + assert!(deleted.iter().all(|i| *i % 2 != 0), "{deleted:?}"); + } +} + #[cfg(test)] mod test_map { use super::*; diff --git a/src/raw/sse2.rs b/src/raw/sse2.rs index 956ba5d26..acee95c4d 100644 --- a/src/raw/sse2.rs +++ b/src/raw/sse2.rs @@ -1,5 +1,5 @@ use super::bitmask::BitMask; -use super::EMPTY; +use super::{DELETED, EMPTY}; use core::mem; use core::num::NonZeroU16; @@ -102,6 +102,9 @@ impl Group { /// `EMPTY` or `DELETED`. #[inline] pub(crate) fn match_empty_or_deleted(self) -> BitMask { + debug_assert_eq!(127, EMPTY); + debug_assert_eq!(126, DELETED); + #[allow( // byte: i32 as u16 // note: _mm_movemask_epi8 returns a 16-bit mask in a i32, the @@ -110,15 +113,30 @@ impl Group { clippy::cast_possible_truncation )] unsafe { - // A byte is EMPTY or DELETED iff the high bit is set - BitMask(x86::_mm_movemask_epi8(self.0) as u16) + // A byte is EMPTY or DELETED iff it is greater than or equal to DELETED. 
+ let is_special = x86::_mm_cmpgt_epi8(self.0, x86::_mm_set1_epi8(DELETED as i8 - 1)); + BitMask(x86::_mm_movemask_epi8(is_special) as u16) } } /// Returns a `BitMask` indicating all bytes in the group which are full. #[inline] pub(crate) fn match_full(&self) -> BitMask { - self.match_empty_or_deleted().invert() + debug_assert_eq!(127, EMPTY); + debug_assert_eq!(126, DELETED); + + #[allow( + // byte: i32 as u16 + // note: _mm_movemask_epi8 returns a 16-bit mask in a i32, the + // upper 16-bits of the i32 are zeroed: + clippy::cast_sign_loss, + clippy::cast_possible_truncation + )] + unsafe { + // A byte is full iff it is strictly less than DELETED. + let is_full = x86::_mm_cmplt_epi8(self.0, x86::_mm_set1_epi8(DELETED as i8)); + BitMask(x86::_mm_movemask_epi8(is_full) as u16) + } } /// Performs the following transformation on all bytes in the group: @@ -127,22 +145,20 @@ impl Group { /// - `FULL => DELETED` #[inline] pub(crate) fn convert_special_to_empty_and_full_to_deleted(self) -> Self { - // Map high_bit = 1 (EMPTY or DELETED) to 1111_1111 - // and high_bit = 0 (FULL) to 1000_0000 - // - // Here's this logic expanded to concrete values: - // let special = 0 > byte = 1111_1111 (true) or 0000_0000 (false) - // 1111_1111 | 1000_0000 = 1111_1111 - // 0000_0000 | 1000_0000 = 1000_0000 - #[allow( - clippy::cast_possible_wrap, // byte: 0x80_u8 as i8 - )] + debug_assert_eq!(127, EMPTY); + debug_assert_eq!(126, DELETED); + + #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] unsafe { - let zero = x86::_mm_setzero_si128(); - let special = x86::_mm_cmpgt_epi8(zero, self.0); + let empty = x86::_mm_set1_epi8(EMPTY as i8); + let deleted = x86::_mm_set1_epi8(DELETED as i8); + + let is_full = x86::_mm_cmplt_epi8(self.0, deleted); + let is_special = x86::_mm_cmpeq_epi8(is_full, x86::_mm_set1_epi8(0)); + Group(x86::_mm_or_si128( - special, - x86::_mm_set1_epi8(0x80_u8 as i8), + x86::_mm_and_si128(is_full, deleted), + x86::_mm_and_si128(is_special, empty), )) } }