Skip to content

Commit

Permalink
Added first draft of gaps encoding and extended testing
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaCappelletti94 committed Aug 25, 2024
1 parent 0acc3b7 commit 4992d91
Show file tree
Hide file tree
Showing 14 changed files with 5,013 additions and 160 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ twox-hash = {version="1.6.3", default-features = false}
mem_dbg = {version = "0.2.4", optional = true}
hyperloglog-derive = { path = "hyperloglog-derive" }


[build-dependencies]
serde_json = "1.0"

Expand Down
6 changes: 5 additions & 1 deletion optimal-gap-codes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ serde = { version = "1.0.208", features = ["derive"] }
test_utils = {path="../test_utils"}
twox-hash = "1.6.3"
wyhash = "0.5.0"
dsi-bitstream = {git="https://github.com/LucaCappelletti94/dsi-bitstream-rs.git", branch="main"}
dsi-bitstream = {git="https://github.com/LucaCappelletti94/dsi-bitstream-rs.git", branch="main"}
syn = "2.0"
quote = "1.0"
proc-macro2 = "1.0"
prettyplease = "0.2"
731 changes: 621 additions & 110 deletions optimal-gap-codes/optimal-gap-codes.csv

Large diffs are not rendered by default.

242 changes: 205 additions & 37 deletions optimal-gap-codes/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,38 @@
#![deny(unsafe_code)]
#![deny(unused_macro_rules)]
#![deny(missing_docs)]
extern crate prettyplease;
extern crate proc_macro2;
extern crate quote;
extern crate syn;

use prettyplease::unparse;
use proc_macro2::TokenStream;
use quote::quote;
use syn::{File, Ident};

use dsi_bitstream::prelude::{Code, CodesStats};
use hyperloglog_rs::composite_hash::{switch::SwitchHash, CompositeHash};
use hyperloglog_rs::composite_hash::{current::CurrentHash, switch::SwitchHash, CompositeHash};
use hyperloglog_rs::prelude::*;
use indicatif::MultiProgress;
use indicatif::{ParallelProgressIterator, ProgressBar, ProgressStyle};
use rayon::prelude::*;
use serde::Deserialize;
use serde::Serialize;
use serde::{Deserialize, Serializer};
use std::collections::HashMap;
use test_utils::prelude::{append_csv, read_csv, write_csv};
use twox_hash::XxHash64;

fn float_formatter<S>(value: &f64, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&format!("{value:.4}"))
}

type CS = CodesStats<50, 50, 50, 50, 50>;

#[derive(Debug, Serialize, Deserialize, PartialEq, PartialOrd)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
/// Report of the gap between subsequent hashes in the Listhash variant of HyperLogLog.
struct GapReport {
/// The precision exponent of the HyperLogLog, determining
Expand All @@ -42,9 +58,92 @@ struct GapReport {
/// The uncompressed space usage if no code was used.
uncompressed_space_usage: u64,
/// The rate of the optimal code.
#[serde(serialize_with = "float_formatter")]
rate: f64,
/// Mean encoded gap size in bits.
#[serde(serialize_with = "float_formatter")]
mean_gap_size: f64,
/// The number of hashes that can fit without the optimal code.
number_of_hashes: u64,
/// The number of hashes that can fit with the optimal code.
number_of_hashes_with_code: u64,
/// Number of extra hashes that can fit with the optimal code and not
/// without it.
extra_hashes: u64,
}

impl GapReport {
    /// Renders the `PrefixFreeCode` implementation described by this report.
    ///
    /// The emitted `impl` is gated behind the `precision_<precision>` feature and
    /// targets the composite hash named in the report, parametrized by the
    /// `Precision<precision>` and `Bits<bit_size>` types. A code spelled
    /// `Name(k)` is emitted as `Name<k>`, while a bare `Name` is emitted as is.
    ///
    /// # Panics
    ///
    /// Panics if a generated name is not a valid identifier, or if the
    /// parenthesized parameter of the code cannot be parsed as a `usize`.
    fn as_prefix_free_code_impl(&self) -> TokenStream {
        let span = proc_macro2::Span::call_site();

        let precision_type = Ident::new(&format!("Precision{}", self.precision), span);
        let bits_type = Ident::new(&format!("Bits{}", self.bit_size), span);

        // The first piece is the bare code name; when the code carries a
        // parameter, the last piece holds it, followed by the closing parenthesis.
        let code_text: &str = &self.code;
        let mut pieces = code_text.split('(');
        let code_name = Ident::new(pieces.next().unwrap(), span);
        let code_type = match pieces.last() {
            Some(tail) => {
                let parameter = tail.split(')').next().unwrap().parse::<usize>().unwrap();
                quote! { #code_name<#parameter> }
            }
            None => quote! { #code_name },
        };

        let hash_size = self.hash_size;
        let hash_type = Ident::new(&self.composite_hash, span);
        let feature_name = format!("precision_{}", self.precision);

        quote! {
            #[cfg(feature = #feature_name)]
            impl super::PrefixFreeCode<#hash_size> for crate::composite_hash::#hash_type<crate::precisions::#precision_type, crate::bits::#bits_type> {
                type Code = super::prefix_free_codes::#code_type;
            }
        }
    }

    /// Renders a test-only `PrefixFreeCode` implementation that uses
    /// `NoPrefixCode` for the given combination of parameters.
    ///
    /// The emitted `impl` is gated behind both the `precision_<precision>`
    /// feature and `cfg(test)`.
    ///
    /// # Panics
    ///
    /// Panics if `composite_hash` is not a valid identifier.
    fn as_test_only_prefix_free_code_impl(
        precision: u8,
        bit_size: u8,
        hash_size: u8,
        composite_hash: &str,
    ) -> TokenStream {
        let span = proc_macro2::Span::call_site();

        let bits_type = Ident::new(&format!("Bits{bit_size}"), span);
        let hash_type = Ident::new(composite_hash, span);
        let feature_name = format!("precision_{precision}");
        let precision_type = Ident::new(&format!("Precision{precision}"), span);

        quote! {
            #[cfg(feature = #feature_name)]
            #[cfg(test)]
            impl super::PrefixFreeCode<#hash_size> for crate::composite_hash::#hash_type<crate::precisions::#precision_type, crate::bits::#bits_type> {
                type Code = super::prefix_free_codes::NoPrefixCode<#hash_size>;
            }
        }
    }
}

/// Normalizes the name of a hasher type.
Expand Down Expand Up @@ -86,7 +185,7 @@ fn optimal_gap_codes<
}
}

let iterations = 50_000;
let iterations = 100_000;
let hll = Hybrid::<PlusPlus<P, B, <P as ArrayRegister<B>>::Packed, H>, CH>::default();

let progress_bar = multiprogress.add(ProgressBar::new(iterations as u64));
Expand Down Expand Up @@ -117,14 +216,12 @@ fn optimal_gap_codes<
let hash_size = hll.hash_bytes() * 8;
let mut last_hash: Option<u64> = None;
for hash in hll.hashes().unwrap() {
if let Some(last_hash) = last_hash {
assert!(last_hash >= hash);
let entry = gap_report
.entry(hash_size)
.or_insert_with(|| (CS::default(), 0));
entry.0.update(last_hash - hash);
entry.1 += 1;
}
let gap = last_hash.map(|last_hash| last_hash - hash).unwrap_or(hash);
let entry = gap_report
.entry(hash_size)
.or_insert_with(|| (CS::default(), 0));
entry.0.update(gap);
entry.1 += 1;
last_hash = Some(hash);
}
}
Expand All @@ -133,14 +230,12 @@ fn optimal_gap_codes<
let hash_size = hll.hash_bytes() * 8;
let mut last_hash: Option<u64> = None;
for hash in hll.hashes().unwrap() {
if let Some(last_hash) = last_hash {
assert!(last_hash >= hash);
let entry = gap_report
.entry(hash_size)
.or_insert_with(|| (CS::default(), 0));
entry.0.update(last_hash - hash);
entry.1 += 1;
}
let gap = last_hash.map(|last_hash| last_hash - hash).unwrap_or(hash);
let entry = gap_report
.entry(hash_size)
.or_insert_with(|| (CS::default(), 0));
entry.0.update(gap);
entry.1 += 1;
last_hash = Some(hash);
}
break;
Expand All @@ -152,9 +247,7 @@ fn optimal_gap_codes<
|| HashMap::new(),
|mut acc, report| {
for (hash_size, (gap_report, total)) in report {
let hash_size_report = acc
.entry(hash_size)
.or_insert_with(|| (CS::default(), 0));
let hash_size_report = acc.entry(hash_size).or_insert_with(|| (CS::default(), 0));
hash_size_report.0 += gap_report;
hash_size_report.1 += total;
}
Expand All @@ -168,9 +261,13 @@ fn optimal_gap_codes<
gaps.iter().map(|(hash_size, (gap_report, total))| {
let (code, space_usage): (Code, u64) = gap_report.best_code();

let uncompressed_space_usage = u64::from(*hash_size) * (total + 1);
let uncompressed_space_usage = u64::from(*hash_size) * *total as u64;
let rate = space_usage as f64 / uncompressed_space_usage as f64;
let mean_gap_size = space_usage as f64 / *total as f64;
let number_of_hashes = *total / iterations;
let number_of_hashes_with_code =
(uncompressed_space_usage as f64 / mean_gap_size / iterations as f64) as u64;
let extra_hashes = number_of_hashes_with_code.saturating_sub(number_of_hashes);

GapReport {
precision: P::EXPONENT,
Expand All @@ -182,7 +279,10 @@ fn optimal_gap_codes<
space_usage,
uncompressed_space_usage,
rate,
mean_gap_size
mean_gap_size,
number_of_hashes,
number_of_hashes_with_code,
extra_hashes,
}
}),
path,
Expand All @@ -193,25 +293,25 @@ fn optimal_gap_codes<
/// bit size and hasher types.
macro_rules! generate_optimal_gap_codes {
    ($multiprogress:ident, $precision:ty, $bit_size:ty, $($hasher:ty),*) => {
        // Two runs (one with `CurrentHash`, one with `SwitchHash`) are executed
        // for every hasher, so the bar length must scale with the number of
        // hashers instead of being fixed to a single pair of runs.
        let hasher_names: &[&str] = &[$(stringify!($hasher)),*];
        let progress_bar =
            $multiprogress.add(ProgressBar::new(2 * hasher_names.len() as u64));

        progress_bar.set_style(
            ProgressStyle::default_bar()
                .template("[{elapsed_precise} | {eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}")
                .unwrap()
                .progress_chars("##-"),
        );

        // Draw the bar immediately, before the first (long) run completes.
        progress_bar.tick();

        $(
            optimal_gap_codes::<$precision, $bit_size, $hasher, CurrentHash<$precision, $bit_size>>($multiprogress);
            progress_bar.inc(1);
            optimal_gap_codes::<$precision, $bit_size, $hasher, SwitchHash<$precision, $bit_size>>($multiprogress);
            progress_bar.inc(1);
        )*

        progress_bar.finish_and_clear();
    };
}

Expand Down Expand Up @@ -290,4 +390,72 @@ fn main() {
reports.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

write_csv(reports.iter(), "optimal-gap-codes.csv");

// Next, we generate the implementation of the PrefixFreeCode trait for the optimal codes.
// Of all reports, we keep only the first one we encounter for each combination of precision,
// bit size, hash size and composite hash.
let reports = reports
.into_iter()
.filter(|report| {
// If the report shows that the optimal code achieves less than 1 extra hash, we do not
// generate the implementation.
report.extra_hashes > 0
})
.fold(HashMap::new(), |mut acc, report| {
let key = (
report.precision,
report.bit_size,
report.hash_size,
report.composite_hash.clone(),
);
acc.entry(key).or_insert(report);
acc
});

let valid_impls = reports
.iter()
.map(|(_, report)| report.as_prefix_free_code_impl());


let test_impls = (4..=18)
.flat_map(|precision| {
[4, 5, 6].map(|bits| (precision, bits))
})
.flat_map(|(precision, bits)|{
[8, 16, 24, 32].map(|hash_size|{
(precision, bits, hash_size)
})
})
.flat_map(|(precision, bits, hash_size)|{
["CurrentHash", "SwitchHash"].map(move |composite_hash| {
(precision, bits, hash_size, composite_hash)
})
}).filter(|(precision, bits, hash_size, composite_hash)| {
!reports.contains_key(&(*precision, *bits, *hash_size, composite_hash.to_string()))
}).map(|(precision, bits, hash_size, composite_hash)| {
GapReport::as_test_only_prefix_free_code_impl(precision, bits, hash_size, composite_hash)
});

let output = quote! {
#(#valid_impls)*

#(#test_impls)*
};

// We write out the output token stream to '../src/composite_hash/gaps/optimal_codes.rs'.
let output_path = "../src/composite_hash/gaps/optimal_codes.rs";

// Convert the generated TokenStream to a string
let code_string = output.to_string();

// Parse the generated code string into a syn::File
let syntax_tree: File = syn::parse_str(&code_string).unwrap();

// Use prettyplease to format the syntax tree
let formatted_code = unparse(&syntax_tree);

// Write the formatted code to the output file
std::fs::write(output_path, formatted_code).unwrap();

println!("Generated optimal codes in '{}'", output_path);
}
Loading

0 comments on commit 4992d91

Please sign in to comment.