Finalized massive lineup of models for cardinality benchmark
LucaCappelletti94 committed Aug 21, 2024
1 parent 4bfc3c2 commit 27a3c99
Showing 13 changed files with 228 additions and 119 deletions.
4 changes: 4 additions & 0 deletions Cargo.toml
@@ -133,3 +133,7 @@ harness = false
[[bench]]
name = "hybrid"
harness = false

[[bench]]
name = "array"
harness = false
52 changes: 52 additions & 0 deletions benches/array.rs
@@ -0,0 +1,52 @@
//! Benchmark for the methods of the array data structure.
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use hyperloglog_rs::prelude::*;

const PRECISION: usize = 15;
const REGISTER_SIZE: usize = 6;
const NUMBER_OF_REGISTERS: usize = 1 << PRECISION;
const NUMBER_OF_REGISTERS_IN_U64: usize = 64 / REGISTER_SIZE;
const PADDED_SIZE: usize = ceil(NUMBER_OF_REGISTERS, NUMBER_OF_REGISTERS_IN_U64);
const PACKED_SIZE: usize = ceil(NUMBER_OF_REGISTERS * REGISTER_SIZE, 64);

fn bench_array(c: &mut Criterion) {
let mut group = c.benchmark_group("array");

group.bench_function("array_insert", |b| {
b.iter(|| {
let mut left = 0;
let mut right = 0;
let mut array: Array<PADDED_SIZE, false, Bits6> = Array::default();
for i in 0..NUMBER_OF_REGISTERS {
for value in 0..64 {
let (l, r) = array.set_apply(black_box(i), black_box(|x: u8| x.max(value)));
left ^= l;
right ^= r;
}
}
(left, right)
});
});

group.bench_function("packed_insert", |b| {
b.iter(|| {
let mut left = 0;
let mut right = 0;
let mut packed: Array<PACKED_SIZE, true, Bits6> = Array::default();
for i in 0..NUMBER_OF_REGISTERS {
for value in 0..64 {
let (l, r) = packed.set_apply(black_box(i), black_box(|x: u8| x.max(value)));
left ^= l;
right ^= r;
}
}
(left, right)
});
});

group.finish();
}

criterion_group!(benches, bench_array);

criterion_main!(benches);
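For context on the two layouts being compared, the arithmetic behind the constants above works out as follows; this is a standalone sketch that re-declares `ceil` from src/utils.rs and is not part of the commit.

const fn ceil(numerator: usize, denominator: usize) -> usize {
    (numerator + denominator - 1) / denominator
}

fn main() {
    let number_of_registers = 1usize << 15; // PRECISION = 15
    let registers_per_u64 = 64 / 6; // 10 whole 6-bit registers fit in a u64
    assert_eq!(ceil(number_of_registers, registers_per_u64), 3_277); // PADDED_SIZE
    assert_eq!(ceil(number_of_registers * 6, 64), 3_072); // PACKED_SIZE
    // The packed layout saves 205 u64 words (about 6%) by letting registers
    // straddle word boundaries; the benchmark measures what that costs on writes.
}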
4 changes: 3 additions & 1 deletion hyperloglog-derive/src/lib.rs
@@ -546,7 +546,7 @@ pub fn derive_variable_word(input: TokenStream) -> TokenStream {
#[inline]
#[must_use]
fn name(&self) -> String {
"#name".to_owned()
stringify!(#name).to_string()
}
}
};
@@ -812,3 +812,5 @@ pub fn test_estimator(_attr: TokenStream, item: TokenStream) -> TokenStream {
// Convert the expanded code into a token stream
TokenStream::from(expanded)
}
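The string-literal fix above matters because `quote!` never interpolates `#name` inside a string literal, so the old code returned the literal text "#name". A minimal standalone sketch, using `u24` purely as an illustrative identifier and assuming the `quote`, `syn`, and `proc-macro2` crates already used by this derive crate:

use proc_macro2::Span;
use quote::quote;
use syn::Ident;

fn main() {
    // The identifier the derive macro binds to #name, e.g. one of the generated word types.
    let name = Ident::new("u24", Span::call_site());
    // Interpolation does not reach inside string literals, so "#name" stays literal,
    // while stringify!(#name) expands to stringify!(u24) and evaluates to "u24".
    let broken = quote! { "#name".to_owned() };
    let fixed = quote! { stringify!(#name).to_string() };
    println!("{broken}\n{fixed}");
}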


14 changes: 7 additions & 7 deletions src/mle.rs
@@ -294,16 +294,15 @@ fn mle_union_cardinality<

let yjoint_right_zleft = y_register[2] * z_register[0] * y_register[1];
let yjoint_left_zright = y_register[2] * z_register[1] * y_register[0];
let yjointleft = y_register[2] * y_register[0];
let yjointright = y_register[2] * y_register[1];
let zj_plus_yjoint_zright = z_register[2] + y_register[2] * z_register[1];
let zj_plus_yjoint_zlr = z_register[2] + y_register[2] * z_register[0] * z_register[1];
let reciprocal_zj_plus_yjoint_zlr = f64::ONE / zj_plus_yjoint_zlr;

let left_reciprocal = left_smaller_k
* (y_register[2] * y_register[0] / (z_register[2] + y_register[2] * z_register[0]) - f64::ONE);
let right_reciprocal =
right_smaller_k * (yjointright / zj_plus_yjoint_zright - f64::ONE);
* (y_register[2] * y_register[0] / (z_register[2] + y_register[2] * z_register[0])
- f64::ONE);
let right_reciprocal = right_smaller_k
* (y_register[2] * y_register[1] / zj_plus_yjoint_zright - f64::ONE);

let delta = [
left_reciprocal
@@ -315,7 +314,8 @@
left_reciprocal
+ right_reciprocal
+ joint_k
* ((yjointleft + yjoint_right_zleft) * reciprocal_zj_plus_yjoint_zlr
* ((y_register[2] * y_register[0] + yjoint_right_zleft)
* reciprocal_zj_plus_yjoint_zlr
- f64::ONE),
];

@@ -472,7 +472,7 @@ impl<const N: usize> Default for Adam<N> {
first_moments: [0.0; N],
second_moments: [0.0; N],
time: 0,
learning_rate: 0.01,
learning_rate: 0.1,
first_order_decay_factor: 0.9,
second_order_decay_factor: 0.999,
}
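For reference, the defaults above parameterize a standard Adam optimizer driving the MLE gradient steps; the sketch below is the textbook update rule with the same field names, but the `step` signature, the bias correction, and the update sign are assumptions rather than the crate's exact implementation.

struct Adam<const N: usize> {
    first_moments: [f64; N],
    second_moments: [f64; N],
    time: i32,
    learning_rate: f64,
    first_order_decay_factor: f64,
    second_order_decay_factor: f64,
}

impl<const N: usize> Adam<N> {
    fn step(&mut self, parameters: &mut [f64; N], gradients: &[f64; N]) {
        self.time += 1;
        let (b1, b2) = (self.first_order_decay_factor, self.second_order_decay_factor);
        for i in 0..N {
            // Exponential moving averages of the gradient and its square.
            self.first_moments[i] = b1 * self.first_moments[i] + (1.0 - b1) * gradients[i];
            self.second_moments[i] = b2 * self.second_moments[i] + (1.0 - b2) * gradients[i] * gradients[i];
            // Bias-corrected moments.
            let m_hat = self.first_moments[i] / (1.0 - b1.powi(self.time));
            let v_hat = self.second_moments[i] / (1.0 - b2.powi(self.time));
            // A ten-fold larger learning_rate (0.1 instead of 0.01) takes
            // proportionally larger steps per iteration.
            parameters[i] -= self.learning_rate * m_hat / (v_hat.sqrt() + 1e-8);
        }
    }
}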
9 changes: 7 additions & 2 deletions src/registers/packed_array.rs
@@ -712,6 +712,7 @@ impl<const N: usize, const PACKED: bool, V: VariableWord> Array<N, PACKED, V> {
}

#[inline]
#[allow(unsafe_code)]
/// Applies a function to the value at the given index.
///
/// # Arguments
@@ -720,14 +721,18 @@ impl<const N: usize, const PACKED: bool, V: VariableWord> Array<N, PACKED, V> {
///
/// # Returns
/// The previous value at the given index and the new value.
fn set_apply<F>(&mut self, index: usize, ops: F) -> (V::Word, V::Word)
///
/// # Safety
/// This method accesses values in the underlying array without checking whether the index is valid,
/// as it is guaranteed to be valid by the split_index method.
pub fn set_apply<F>(&mut self, index: usize, ops: F) -> (V::Word, V::Word)
where
F: Fn(V::Word) -> V::Word,
{
let (word_index, relative_value_offset) = split_index::<PACKED, V>(index);

if Self::is_bridge_offset(relative_value_offset) {
let (low, high) = self.words.split_at_mut(word_index + 1);
let (low, high) = unsafe {self.words.split_at_mut_unchecked(word_index + 1)};
let low = &mut low[word_index];
let high = &mut high[0];
let value = extract_bridge_value_from_word::<V>(*low, *high, relative_value_offset);
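A minimal usage sketch for the now-public `set_apply`, mirroring the max-closure pattern of benches/array.rs above; the sizes are arbitrary and a zero-initialized default array is assumed, so this is illustrative rather than part of the commit.

use hyperloglog_rs::prelude::*;

const REGISTERS: usize = 1 << 10;
const PACKED_WORDS: usize = ceil(REGISTERS * 6, 64);

fn demo() {
    // A packed vector of 6-bit registers; `true` selects the packed layout.
    let mut registers: Array<PACKED_WORDS, true, Bits6> = Array::default();
    // `set_apply` returns (previous value, new value) for the register at the index.
    let (previous, updated) = registers.set_apply(42, |register: u8| register.max(7));
    assert_eq!(previous, 0);
    assert_eq!(updated, 7);
}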
2 changes: 1 addition & 1 deletion src/utils.rs
@@ -71,7 +71,7 @@ impl Named for u64 {
/// # Arguments
/// * `numerator` - The numerator of the division.
/// * `denominator` - The denominator of the division.
pub(crate) const fn ceil(numerator: usize, denominator: usize) -> usize {
pub const fn ceil(numerator: usize, denominator: usize) -> usize {
(numerator + denominator - 1) / denominator
}

2 changes: 1 addition & 1 deletion statistical_comparisons/Cargo.toml
@@ -14,7 +14,7 @@ cardinality-estimator = {git = "https://github.com/LucaCappelletti94/cardinality
rust-hyperloglog = {git = "https://github.com/LucaCappelletti94/rust-hyperloglog.git", branch = "updated_siphasher", package = "hyperloglog", features = ["mem_dbg"]}
sourmash = {git="https://github.com/LucaCappelletti94/sourmash.git", features = ["mem_dbg"], branch = "latest_merged"}
hypertwobits = {git="https://github.com/LucaCappelletti94/hypertwobits.git", features = ["mem_dbg"], branch="main"}
simple_hll = {git="https://github.com/LucaCappelletti94/simple_hll.git", features = ["mem_dbg"], branch="main"}
simple_hll = {git="https://github.com/LucaCappelletti94/simple_hll.git", features = ["mem_dbg"], branch="hasher"}
stattest = {git = "https://github.com/LucaCappelletti94/stattest", branch = "faster_wilcoxon"}
csv = "1.3.0"
wyhash = {git="https://github.com/LucaCappelletti94/wyhash-rs", branch="merged", features=["mem_dbg"]}
2 changes: 1 addition & 1 deletion statistical_comparisons/macro_test_utils/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
syn = "2.0"
syn = {version="2.0", features=["full"]}
quote = "1.0"
proc-macro2 = "1.0"

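Enabling `syn`'s `full` feature makes item-level syntax types such as `syn::ItemFn` available, which the new `cardinality_benchmark` attribute below parses.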
85 changes: 84 additions & 1 deletion statistical_comparisons/macro_test_utils/src/lib.rs
@@ -3,7 +3,7 @@
use proc_macro::TokenStream;
use quote::quote;
use syn::{parse_macro_input, Data, DeriveInput, Fields};
use syn::{parse_macro_input, Data, DeriveInput, Fields, Ident, ItemFn};

#[proc_macro_derive(Named)]
pub fn my_trait_derive(input: TokenStream) -> TokenStream {
@@ -222,3 +222,86 @@ pub fn transparent_mem_size_derive(input: TokenStream) -> TokenStream {

TokenStream::from(expanded)
}

#[proc_macro_attribute]
pub fn cardinality_benchmark(_attr: TokenStream, item: TokenStream) -> TokenStream {
// Parse the input token stream (the function we're deriving for)
let input = parse_macro_input!(item as ItemFn);

// Extract the function name
let fn_name = &input.sig.ident;

// Define a list of generics we want to cover
let precisions = (4..=18)
.map(|precision| {
(
precision,
Ident::new(&format!("Precision{}", precision), fn_name.span()),
)
})
.collect::<Vec<(usize, _)>>();
let bits = (4..=6)
.map(|bits| (bits, Ident::new(&format!("Bits{}", bits), fn_name.span())))
.collect::<Vec<(usize, _)>>();
let hashers = vec![
Ident::new("XxHash64", fn_name.span()),
Ident::new("WyHash", fn_name.span()),
Ident::new("AHasher", fn_name.span()),
Ident::new("XxH3", fn_name.span()),
];
let words = vec![
(8, Ident::new("u8", fn_name.span())),
(16, Ident::new("u16", fn_name.span())),
(24, Ident::new("u24", fn_name.span())),
(32, Ident::new("u32", fn_name.span())),
(48, Ident::new("u48", fn_name.span())),
(56, Ident::new("u56", fn_name.span())),
(64, Ident::new("u64", fn_name.span())),
];

// Generate the test functions
let benchmarks = hashers.into_iter().flat_map(move |hasher| {
let precisions = precisions.clone();
let bits = bits.clone();
let words = words.clone();
precisions.into_iter().map(move |(exponent, precision)| {
let bits = bits.clone();
let words = words.clone();
let hasher = hasher.clone();
let h2b_calls = quote! {
HyperTwoVariants::<#hasher>::prepare_cardinality_reports();
};
let hll_calls = bits
.into_iter()
.flat_map(move |(bit_num, bit)| {
let words = words.clone();
let precision = precision.clone();
let hasher = hasher.clone();
words.into_iter().map(move |(word_size, word)| {
if exponent + bit_num > word_size {
return quote! {};
}
quote! {
HLLVariants::<#exponent, #precision, #hasher, #bit_num, #bit, #word>::prepare_cardinality_reports();
}
})
});
quote! {
#h2b_calls
#(#hll_calls)*
}
})
});

// Generate the final token stream
let expanded = quote! {
#input

fn cardinality_benchmarks() {
#(#benchmarks)*
}
};

// Convert the expanded code into a token stream
TokenStream::from(expanded)
}
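Roughly, the generated `cardinality_benchmarks` function has the shape of the hand-written sketch below, shown for a single hasher; the real expansion enumerates precisions 4..=18, bits 4..=6, all four hashers, and every word size, and is only meaningful inside the crate where these types are in scope.

fn cardinality_benchmarks() {
    HyperTwoVariants::<XxHash64>::prepare_cardinality_reports();
    HLLVariants::<4, Precision4, XxHash64, 4, Bits4, u8>::prepare_cardinality_reports();
    HLLVariants::<4, Precision4, XxHash64, 4, Bits4, u16>::prepare_cardinality_reports();
    // ...
    HLLVariants::<18, Precision18, XxHash64, 6, Bits6, u32>::prepare_cardinality_reports();
    // Combinations whose precision + bits exceed the word width, such as
    // <18, Precision18, XxHash64, 6, Bits6, u8>, are skipped entirely.
}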
64 changes: 18 additions & 46 deletions statistical_comparisons/src/enumerations.rs
@@ -51,56 +51,28 @@ pub enum HyperTwoVariants<H: HasherBuilderAssociated> {
#[derive(Clone, Named, ExtendableApproximatedSet, Estimator, TransparentMemSize, EnumIter)]
/// Enumeration with all `HyperLogLog` variants we
/// take into consideration for the benchmarks.
pub enum HLLVariants<const EXPONENT: usize, P: Precision, H: HasherBuilderAssociated>
pub enum HLLVariants<const EXPONENT: usize, P: Precision, H: HasherBuilderAssociated, const BITS: usize, B, CH>
where
P: AllArrays + Named,
<P as ArrayRegister<Bits4>>::Array: VariableWords<u32>,
<P as ArrayRegister<Bits5>>::Array: VariableWords<u32>,
<P as ArrayRegister<Bits6>>::Array: VariableWords<u32>,
<P as ArrayRegister<Bits4>>::Packed: VariableWords<u32>,
<P as ArrayRegister<Bits5>>::Packed: VariableWords<u32>,
<P as ArrayRegister<Bits6>>::Packed: VariableWords<u32>,
P: Named + ArrayRegister<B>,
B: Named + Bits,
CH: Named + CompositeHash<P, B>,
<P as ArrayRegister<B>>::Array: VariableWords<CH>,
<P as ArrayRegister<B>>::Packed: VariableWords<CH>,
{
TabacHyperLogLogPlus(TabacHLLPlusPlus<P, H>),
TabacHyperLogLogPF(TabacHLL<P, H>),
SAHyperLogLog(AlecHLL<P>),
RustHyperLogLog(RustHLL<P>),
CE4(CloudFlareHLL<EXPONENT, 4, H>),
CE5(CloudFlareHLL<EXPONENT, 5, H>),
CE6(CloudFlareHLL<EXPONENT, 6, H>),
SimpleHLL(SimpleHLL<EXPONENT>),
PP4ArrayXxhasher(PlusPlus<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>),
PP5ArrayXxhasher(PlusPlus<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>),
PP6ArrayXxhasher(PlusPlus<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>),
PP4PackedXxhasher(PlusPlus<P, Bits4, <P as ArrayRegister<Bits4>>::Packed, H>),
PP5PackedXxhasher(PlusPlus<P, Bits5, <P as ArrayRegister<Bits5>>::Packed, H>),
PP6PackedXxhasher(PlusPlus<P, Bits6, <P as ArrayRegister<Bits6>>::Packed, H>),
LLB4ArrayXxhasher(LogLogBeta<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>),
LLB5ArrayXxhasher(LogLogBeta<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>),
LLB6ArrayXxhasher(LogLogBeta<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>),
LLB5PackedXxhasher(LogLogBeta<P, Bits5, <P as ArrayRegister<Bits5>>::Packed, H>),
LLB6PackedXxhasher(LogLogBeta<P, Bits6, <P as ArrayRegister<Bits6>>::Packed, H>),
MLEPP4Xxhasher(MLE<PlusPlus<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>>),
MLEPP5Xxhasher(MLE<PlusPlus<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>>),
MLEPP6Xxhasher(MLE<PlusPlus<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>>),
MLELLB4Xxhasher(MLE<LogLogBeta<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>>),
MLELLB5Xxhasher(MLE<LogLogBeta<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>>),
MLELLB6Xxhasher(MLE<LogLogBeta<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>>),
HybridPP4ArrayXxhasher(Hybrid<PlusPlus<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>>),
HybridPP5ArrayXxhasher(Hybrid<PlusPlus<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>>),
HybridPP6ArrayXxhasher(Hybrid<PlusPlus<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>>),
HybridPP4PackedXxhasher(Hybrid<PlusPlus<P, Bits4, <P as ArrayRegister<Bits4>>::Packed, H>>),
HybridPP5PackedXxhasher(Hybrid<PlusPlus<P, Bits5, <P as ArrayRegister<Bits5>>::Packed, H>>),
HybridPP6PackedXxhasher(Hybrid<PlusPlus<P, Bits6, <P as ArrayRegister<Bits6>>::Packed, H>>),
HybridLLB4ArrayXxhasher(Hybrid<LogLogBeta<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>>),
HybridLLB5ArrayXxhasher(Hybrid<LogLogBeta<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>>),
HybridLLB6ArrayXxhasher(Hybrid<LogLogBeta<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>>),
HybridLLB5PackedXxhasher(Hybrid<LogLogBeta<P, Bits5, <P as ArrayRegister<Bits5>>::Packed, H>>),
HybridLLB6PackedXxhasher(Hybrid<LogLogBeta<P, Bits6, <P as ArrayRegister<Bits6>>::Packed, H>>),
HybridMLEPP4Xxhasher(Hybrid<MLE<PlusPlus<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>>>),
HybridMLEPP5Xxhasher(Hybrid<MLE<PlusPlus<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>>>),
HybridMLEPP6Xxhasher(Hybrid<MLE<PlusPlus<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>>>),
HybridMLELLB4Xxhasher(Hybrid<MLE<LogLogBeta<P, Bits4, <P as ArrayRegister<Bits4>>::Array, H>>>),
HybridMLELLB5Xxhasher(Hybrid<MLE<LogLogBeta<P, Bits5, <P as ArrayRegister<Bits5>>::Array, H>>>),
HybridMLELLB6Xxhasher(Hybrid<MLE<LogLogBeta<P, Bits6, <P as ArrayRegister<Bits6>>::Array, H>>>),
CE4(CloudFlareHLL<EXPONENT, BITS, H>),
SimpleHLL(SimpleHLL<H, EXPONENT>),
PP4ArrayXxhasher(PlusPlus<P, B, <P as ArrayRegister<B>>::Array, H>),
PP4PackedXxhasher(PlusPlus<P, B, <P as ArrayRegister<B>>::Packed, H>),
LLB4ArrayXxhasher(LogLogBeta<P, B, <P as ArrayRegister<B>>::Array, H>),
MLEPP4Xxhasher(MLE<PlusPlus<P, B, <P as ArrayRegister<B>>::Array, H>>),
MLELLB4Xxhasher(MLE<LogLogBeta<P, B, <P as ArrayRegister<B>>::Array, H>>),
HybridPP4ArrayXxhasher(Hybrid<PlusPlus<P, B, <P as ArrayRegister<B>>::Array, H>, CH>),
HybridPP4PackedXxhasher(Hybrid<PlusPlus<P, B, <P as ArrayRegister<B>>::Packed, H>, CH>),
HybridLLB4ArrayXxhasher(Hybrid<LogLogBeta<P, B, <P as ArrayRegister<B>>::Array, H>, CH>),
HybridMLEPP4Xxhasher(Hybrid<MLE<PlusPlus<P, B, <P as ArrayRegister<B>>::Array, H>>, CH>),
HybridMLELLB4Xxhasher(Hybrid<MLE<LogLogBeta<P, B, <P as ArrayRegister<B>>::Array, H>>, CH>),
}
26 changes: 13 additions & 13 deletions statistical_comparisons/src/estimation_tests.rs
@@ -49,9 +49,10 @@ pub(crate) fn cardinality_test<
multi_progress: Option<&indicatif::MultiProgress>,
) -> Vec<PerformanceReport> {
let number_of_vectors = 1_000_u64;
let minimum_sample_interval = 5_u64;
let maximum_sample_interval = 20_000_u64;
let random_state = splitmix64(9_516_748_163_234_878_233_u64);
let number_of_elements = 1_000_000;
let sample_interval = number_of_elements / 1_000;
let sequence_random_state = splitmix64(9_516_748_163_234_878_233_u64);
let sample_index_random_state = splitmix64(234_878_239_9_516_748_163_u64);

let estimator_name = estimator.name();

@@ -73,23 +74,22 @@
progress_bar
})
.flat_map(|thread_number| {
let mut random_state =
splitmix64(splitmix64(random_state.wrapping_mul(thread_number + 1)));
let sequence_random_state =
splitmix64(splitmix64(sequence_random_state.wrapping_mul(thread_number + 1)));
let mut sample_index_random_state =
splitmix64(splitmix64(sample_index_random_state.wrapping_mul(thread_number + 1)));
let mut performance_reports = Vec::new();
let mut estimator = estimator.clone();

let mut current_sample_rate = minimum_sample_interval;
let mut next_sample_index = sample_interval;

for (i, element) in
iter_random_values::<u64>(2_000_000, None, Some(random_state)).enumerate()
iter_random_values::<u64>(number_of_elements, None, Some(sequence_random_state)).enumerate()
{
estimator.insert(&element);

if u64::try_from(i).unwrap() % current_sample_rate == 0 {
if current_sample_rate < maximum_sample_interval {
random_state = splitmix64(random_state);
current_sample_rate += random_state % current_sample_rate;
}
if next_sample_index == i as u64{
sample_index_random_state = splitmix64(sample_index_random_state);
next_sample_index += sample_index_random_state % sample_interval;

performance_reports.push(PerformanceReport {
prediction: estimator.estimate_cardinality(),
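For reference, the new schedule samples roughly every 500 elements on average instead of backing off geometrically; a standalone sketch of the index progression, with `splitmix64` written out as the standard mixer on the assumption that it matches the crate's helper closely enough for illustration:

fn splitmix64(mut x: u64) -> u64 {
    x = x.wrapping_add(0x9E37_79B9_7F4A_7C15);
    x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    x ^ (x >> 31)
}

fn sample_indices(number_of_elements: u64, mut state: u64) -> Vec<u64> {
    let sample_interval = number_of_elements / 1_000;
    let mut next_sample_index = sample_interval;
    let mut indices = Vec::new();
    for i in 0..number_of_elements {
        if next_sample_index == i {
            // Advance by a uniform step in 0..sample_interval (~sample_interval / 2
            // on average), so a 1_000_000-element stream yields on the order of
            // 2_000 evenly spread cardinality measurements.
            state = splitmix64(state);
            next_sample_index += state % sample_interval;
            indices.push(i);
        }
    }
    indices
}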