Improved theoretical estimations
LucaCappelletti94 committed Sep 2, 2024
1 parent 86a9bab commit d2806f0
Showing 9 changed files with 2,075 additions and 196,556 deletions.
Binary file modified measure_variant/comparison.png
99,330 changes: 1,025 additions & 98,305 deletions measure_variant/latest_report.csv

Large diffs are not rendered by default.

99,046 changes: 883 additions & 98,163 deletions measure_variant/reference_report.csv

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions measure_variant/src/main.rs
@@ -10,22 +10,22 @@ use test_utils::prelude::{compare_features, read_csv, write_csv};

 type HLL1 = Hybrid<
     PlusPlus<
-        Precision17,
+        Precision10,
         Bits6,
-        <Precision17 as ArrayRegister<Bits6>>::Packed,
+        <Precision10 as ArrayRegister<Bits6>>::Packed,
         twox_hash::XxHash64,
     >,
-    SwitchHash<Precision17, Bits6>,
+    SwitchHash<Precision10, Bits6>,
 >;

 type HLL2 = Hybrid<
     PlusPlus<
-        Precision17,
+        Precision10,
         Bits6,
-        <Precision17 as ArrayRegister<Bits6>>::Packed,
+        <Precision10 as ArrayRegister<Bits6>>::Packed,
         twox_hash::XxHash64,
     >,
-    GapHash<SwitchHash<Precision17, Bits6>>,
+    GapHash<SwitchHash<Precision10, Bits6>>,
 >;

 const ITERATIONS: usize = 100;
@@ -43,7 +43,7 @@ struct Report {
 fn main() {
     let random_state = 7_536_558_723_694_876_u64;

-    let max_hashes = (1 << 17) * 6 / 8;
+    let max_hashes = (1 << 10) * 6 / 6;
     let max = max_hashes as u64;

     let progress_bar = ProgressBar::new(ITERATIONS as u64);
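For scale, a back-of-the-envelope sketch of the register memory this change touches, assuming the usual HyperLogLog layout of 2^p registers of b bits each (the helper below is illustrative, not part of the crate):

    def register_bytes(precision: int, bits: int) -> int:
        """Size in bytes of 2**precision registers of `bits` bits each."""
        return (2**precision * bits + 7) // 8

    print(register_bytes(17, 6))  # 98304 bytes with Precision17 / Bits6
    print(register_bytes(10, 6))  # 768 bytes with Precision10 / Bits6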
Binary file modified scripts/absolute_error_vs_cardinality.png
236 changes: 157 additions & 79 deletions scripts/compare_hybrid_hash.py
@@ -1,18 +1,24 @@
 """Script python to plot and compare the performance of the hybrid hashes with HLL"""

+from typing import Optional, List, Dict, Tuple
+import decimal
+from decimal import Decimal
 import matplotlib.pyplot as plt
-import math
-from typing import Optional
 from tqdm import tqdm

-import decimal
-from decimal import Decimal
-
 # Set precision high enough to handle large values
 decimal.getcontext().prec = 1000


 def compute_hll_error_rate(precision_bits: int) -> Decimal:
     """Computes the error rate of HyperLogLog for a given precision.
     The error rate is computed as 1.04 / sqrt(2^precision_bits).
     Parameters:
         precision_bits (int): Number of bits used for the precision of HyperLogLog.
     """
     # Number of registers m = 2^precision_bits
     m = Decimal(2**precision_bits)
@@ -23,105 +29,177 @@ def compute_hll_error_rate(precision_bits: int) -> Decimal:


 def compute_expected_collisions(
-    m_bits: int,  # Total bits in the hash
-    n_uniform_bits: int,  # Number of strictly uniform bits (p)
-    b_geometric_bits: int,  # Geometric bits (b)
+    uniform_bits: int,  # Uniform bits in the hash
+    geometric_bits: int,  # Geometric bits (b)
     m_samples: int,  # Number of samples (total hashes)
-    p: Decimal = Decimal(0.5),  # Geometric distribution parameter (typically 0.5)
+    saturation: bool,
 ) -> Decimal:
-    # Effective number of bits in the hash (excluding the flag bit)
-    effective_bits = m_bits - 1
-
-    # Maximum value 2^b - 1 (this is the maximum number of leading zeros that can be represented)
-    max_leading_zeros = (2 ** b_geometric_bits) - 1
-
-    # Number of non-uniform bits that remain after subtracting the always-uniform bits and the flag bit
-    remaining_bits = effective_bits - n_uniform_bits
-
-    # Calculate how many bits are taken based on the value of leading zeros
-    # Case 1: 2^b <= remaining_bits (leading zeros and additional bits are used)
-    # Here, all remaining bits are directly taken from the uniform hash
-    n_total_bits_case1 = n_uniform_bits + remaining_bits
-    # Case 2: 2^b > remaining_bits (write leading zeros using b bits, and remaining uniform bits)
-    n_total_bits_case2 = n_uniform_bits + (remaining_bits - b_geometric_bits)
-
-    # Probabilities for each case (geometric distribution PMF)
-    P_case1 = Decimal(min(max_leading_zeros + 1, remaining_bits)) / (max_leading_zeros + 1)
-    P_case2 = Decimal(1) - P_case1
-
-    # Number of possible distinct values for case 1
-    N_case1 = Decimal(2 ** n_total_bits_case1)
-
-    # Number of possible distinct values for case 2
-    N_case2 = Decimal(2 ** n_total_bits_case2)
-
-    # Expected number of unique values for case 1
-    expected_unique_case1 = N_case1 * (1 - (1 - Decimal(1) / N_case1) ** m_samples)
-
-    # Expected number of unique values for case 2
-    expected_unique_case2 = N_case2 * (1 - (1 - Decimal(1) / N_case2) ** m_samples)
-
-    # Weighted average of expected unique values based on the probabilities
-    expected_unique = (P_case1 * expected_unique_case1) + (P_case2 * expected_unique_case2)
+    """Computes the expected number of collisions for the hybrid hash."""
+    if m_samples < 2:
+        return 0

-    # Expected collisions is the total number of samples minus expected unique values
-    expected_collisions = Decimal(m_samples) - expected_unique
+    # First, we need to determine the number of hashes that can be
+    # stored in the provided hash size, considering that part of the
+    # hash size is composed of the geometric bits. Furthermore, we also
+    # need to consider that when the value stored in the geometric bits
+    # is less than or equal to the number of geometric bits, we store it
+    # using unary encoding, and use the newly freed bits to store more
+    # uniform bits, hence expanding the number of hashes that can be stored.
+
+    # We determine the probability P(value of geometric bits <= geometric_bits)
+    if saturation:
+        p_geometric_bits = Decimal(0.0)
+    else:
+        uniform_bits = uniform_bits - 1
+        p_geometric_bits = Decimal(geometric_bits + 1.0) / Decimal(2.0**geometric_bits)
+
+    # The entropy provided by a geometric distribution with probability p = 1/2
+    # is 2. Hence, we can store 2 bits in the geometric bits.
+
+    # We weight the contributions of the possible values of the hash given the
+    # two cases:
+    number_of_hashes = (
+        # value of geometric bits <= geometric_bits
+        Decimal(2**(uniform_bits + geometric_bits)) * p_geometric_bits +
+        # value of geometric bits > geometric_bits
+        Decimal(2**(uniform_bits + 2)) * (Decimal(1.0) - p_geometric_bits)
+    )

-    return expected_collisions
+    # We compute the expected number of collisions
+    return Decimal(m_samples) * Decimal(m_samples - 1) / Decimal(2 * number_of_hashes)
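In other words, the rewritten estimator is a birthday-problem approximation over a weighted hash-space size. With m samples, u uniform bits (one of which is surrendered to the flag in the non-saturated case) and b geometric bits, the function computes

\[
\mathbb{E}[\mathrm{collisions}] \approx \frac{m\,(m-1)}{2N},
\qquad
N = 2^{\,u+b}\,P + 2^{\,u+2}\,(1-P),
\qquad
P = \frac{b+1}{2^{b}},
\]

where P is the weight the code assigns to the unary-encodable case and N the effective number of distinct hash values (P = 0 under saturation).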


 def plot_errors():
     """Plots the expected error rate of HyperLogLog and the collision probability of the hybrid hash."""
     # Precision values for HyperLogLog
     precisions = list(range(4, 19))
+    bits = [4, 5, 6]

-    # We make two plots: one with the absolute errors, and the other one with the errors
-    # normalized by the expected hyperloglog error rate
-    fig, axes = plt.subplots(1, 1, figsize=(10, 10), dpi=100)
+    # We make one subplot per register bit-size
+    fig, axes = plt.subplots(1, len(bits), figsize=(10 * len(bits), 10), dpi=100)

     # Compute the expected error rate of HyperLogLog
     hll_error_rates = [compute_hll_error_rate(precision) for precision in precisions]

     # Plot the expected error rate of HyperLogLog
-    axes.plot(
-        precisions, hll_error_rates, label="HyperLogLog", color="blue", marker="o"
-    )
+    for ax in axes:
+        ax.plot(
+            precisions, hll_error_rates, label="HyperLogLog", color="blue", marker="o"
+        )

-    # Compute the collision probability of the hybrid hash
-    for hash_size in (8, 16, 24, 32, 40, 48, 56, 64):
-        for number_of_bits in (4, 5, 6):
-            mean_collisions = []
-            for precision in precisions:
-                if number_of_bits + precision > hash_size:
-                    continue
+    # We store the error rates for each combination of hash size, precision and number of bits,
+    # so that we can afterwards plot, for each bit size, the smallest hash size that still beats
+    # the HyperLogLog error rate
+    error_rates: Dict[int, List[Tuple[float, int, int]]] = {}
+    error_rates_best_non_zero: Dict[int, List[Tuple[float, int, int]]] = {}
+    immediately_after_error_rates: Dict[int, List[Tuple[float, int, int]]] = {}
+
+    # For each bit size
+    for number_of_bits in (4, 5, 6):
+        # We create a list with the error rate of the smallest hash size
+        # that still beats the HyperLogLog error rate
+        bit_error_rates: List[Tuple[float, int, int]] = []
+        bit_immediately_after_error_rates: List[Tuple[float, int, int]] = []
+        bit_error_rates_best_non_zero: List[Tuple[float, int, int]] = []
+        for precision in precisions:
+            best_hash_size = 0
+            best_error_rate = 1.0
+            for hash_size in reversed(range(precision + number_of_bits, 33)):
                 maximal_number_of_hashes = int(
                     float((2**precision) * number_of_bits) / float(hash_size)
                 )
                 expected_collisions = compute_expected_collisions(
-                    m_bits=hash_size,
-                    n_uniform_bits=precision,
-                    b_geometric_bits=number_of_bits,
+                    uniform_bits=hash_size - number_of_bits,
+                    geometric_bits=number_of_bits,
                     m_samples=maximal_number_of_hashes,
+                    saturation=precision + number_of_bits == hash_size
                 )
-                mean_collisions.append(expected_collisions / maximal_number_of_hashes)
-
-            # Plot the collision probability of the hybrid hash
-            axes.plot(
-                precisions[:len(mean_collisions)],
-                mean_collisions,
-                label=f"(m={hash_size}, b={number_of_bits})",
-                marker="o",
-            )
+                error_rate = expected_collisions / maximal_number_of_hashes
+
+                if error_rate > hll_error_rates[precision - 4] / 100 and len(bit_error_rates_best_non_zero) < precision - 3:
+                    bit_error_rates_best_non_zero.append((error_rate, precision, hash_size))
+
+                if error_rate >= hll_error_rates[precision - 4]:
+                    bit_immediately_after_error_rates.append((error_rate, precision, hash_size))
+                    break
+
+                best_hash_size = hash_size
+                best_error_rate = error_rate
+
+            if best_error_rate < hll_error_rates[precision - 4]:
+                bit_error_rates.append((best_error_rate, precision, best_hash_size))
+
+        error_rates[number_of_bits] = bit_error_rates
+        immediately_after_error_rates[number_of_bits] = bit_immediately_after_error_rates
+        error_rates_best_non_zero[number_of_bits] = bit_error_rates_best_non_zero
+
+    # Next, we plot the error rates for each bit size, including also
+    # the annotation at each datapoint of which hash size is being used
+    for (number_of_bits, ax) in zip(bits, axes):
+        bit_error_rates = error_rates[number_of_bits]
+        bit_immediately_after_error_rates = immediately_after_error_rates[number_of_bits]
+        bit_error_rates_best_non_zero = error_rates_best_non_zero[number_of_bits]
+
+        assert len(bit_immediately_after_error_rates) > 0
+
+        ax.plot(
+            [precision for _, precision, _ in bit_error_rates],
+            [error_rate for error_rate, _, _ in bit_error_rates],
+            label=f"Smallest viable hash (b={number_of_bits})",
+            marker="o",
+        )
+        ax.plot(
+            [precision for _, precision, _ in bit_immediately_after_error_rates],
+            [error_rate for error_rate, _, _ in bit_immediately_after_error_rates],
+            label=f"Immediately after (b={number_of_bits})",
+            marker="o",
+        )
+
+        ax.plot(
+            [precision for _, precision, _ in bit_error_rates_best_non_zero],
+            [error_rate for error_rate, _, _ in bit_error_rates_best_non_zero],
+            label=f"Largest sensible hash (b={number_of_bits})",
+            marker="o",
+        )
+
+        for (error_rate, precision, hash_size) in bit_error_rates:
+            ax.annotate(
+                f"u{hash_size}",
+                (precision, error_rate),
+                textcoords="offset points",
+                xytext=(0, 5),
+                ha="center",
+            )
+
+        for (error_rate, precision, hash_size) in bit_immediately_after_error_rates:
+            ax.annotate(
+                f"u{hash_size}",
+                (precision, error_rate),
+                textcoords="offset points",
+                xytext=(0, 5),
+                ha="center",
+            )

-    # Set the title and labels
-    axes.set_title("Error Rate vs Precision")
-    axes.set_xlabel("Precision")
-    axes.set_ylabel("Error Rate")
-    axes.set_yscale("log")
-    axes.legend(ncol=2)
+        for (error_rate, precision, hash_size) in bit_error_rates_best_non_zero:
+            ax.annotate(
+                f"u{hash_size}",
+                (precision, error_rate),
+                textcoords="offset points",
+                xytext=(0, 5),
+                ha="center",
+            )

-    # Set the grid
-    axes.grid(True, which="both", linestyle="--")
+        # Set the title and labels
+        ax.set_title("Error Rate vs Precision")
+        ax.set_xlabel("Precision")
+        ax.set_ylabel("Error Rate")
+        ax.set_yscale("log")
+        ax.legend()
+
+        # Set the grid
+        ax.grid(True, which="both", linestyle="--")

     fig.tight_layout()
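As a usage sketch of the two helpers above, here is one configuration worked by hand; the pick p = 10, b = 6 with 16-bit hashes is illustrative, not one singled out by the script:

    # Worked example: collision rate vs. HLL error for a single configuration.
    precision, number_of_bits, hash_size = 10, 6, 16
    # Hashes that fit in the register memory: 2**10 * 6 // 16 = 384.
    maximal_number_of_hashes = (2**precision * number_of_bits) // hash_size
    expected = compute_expected_collisions(
        uniform_bits=hash_size - number_of_bits,
        geometric_bits=number_of_bits,
        m_samples=maximal_number_of_hashes,
        saturation=precision + number_of_bits == hash_size,
    )
    # The collision rate (~0.047) exceeds the HLL error at p = 10 (~0.0325),
    # so a 16-bit hash is not yet viable at this precision.
    print(expected / maximal_number_of_hashes, compute_hll_error_rate(precision))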

@@ -157,10 +235,10 @@ def hybrid_approach_absolute_error(cardinality: int, precision: int, bits: int,
                 continue

             expected_collisions = compute_expected_collisions(
-                m_bits=hash_size,
-                n_uniform_bits=precision,
-                b_geometric_bits=6,
-                m_samples=maximal_number_of_hashes
+                uniform_bits=hash_size - 6,
+                geometric_bits=6,
+                m_samples=maximal_number_of_hashes,
+                saturation=precision + 6 == hash_size
             )
             if expected_collisions / cardinality < hyper_log_log_error_rate_at_precision:
                 hash_size_identified = hash_size
Binary file modified scripts/error_rate_vs_collision_probability.png
2 changes: 1 addition & 1 deletion src/composite_hash.rs
@@ -389,7 +389,7 @@ mod test_composite_hash {
             let encoded_hash = CH::encode(index, register, original_hash, hash_bits);

             // We check that the encoded hash indeed fits within the provided number of bits.
-            assert_eq!(encoded_hash >> hash_bits, 0);
+            assert_eq!(encoded_hash >> hash_bits, 0, "The encoded hash is too large for the provided number of bits.");

             let (decoded_register, decoded_index) = CH::decode(encoded_hash, hash_bits);
             assert_eq!(register, decoded_register, "Failed to recover the register at hash bits {hash_bits}. The hash is {encoded_hash:b}. The fake hash is {original_hash:064b}.");
3 changes: 2 additions & 1 deletion src/composite_hash/switch.rs
@@ -250,6 +250,7 @@ impl<P: Precision, B: Bits> CompositeHash for SwitchHash<P, B> {
         let mut composite_hash = (index as u64) << (hash_bits - Self::Precision::EXPONENT);

         // If the hash barely fits as-is, we do not need to do anything special.
+        // ERROR! TODO! FIX!
         if Self::Precision::EXPONENT + Self::Bits::NUMBER_OF_BITS == hash_bits {
             composite_hash |= u64::from(register);
             return composite_hash;
@@ -260,7 +261,7 @@
         // we are currently encoding minus the bits used for the index, instead of
         // storing the register value after the index, we store the higher bits of
         // the original hash after the index.
-        if register > Self::Bits::NUMBER_OF_BITS {
+        if register - 1 > Self::Bits::NUMBER_OF_BITS {
             // In this case, the composite hash has the following structure:
             //
             // [index (exponent bits) | flag = 1 | registers (bitsize) | hash remainder]
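A toy model of the two encodings sketched in the comments above; this is for intuition only, not the crate's implementation, and the 10/6 field widths are illustrative:

    EXPONENT, REGISTER_BITS = 10, 6  # illustrative Precision10 / Bits6 widths

    def toy_switch_encode(index: int, register: int, hash_remainder: int, hash_bits: int) -> int:
        """Pack the fields following the layouts described in the Rust comments."""
        composite = index << (hash_bits - EXPONENT)
        if EXPONENT + REGISTER_BITS == hash_bits:
            # Saturated case: the register exactly fills the bits after the index.
            return composite | register
        # [index (exponent bits) | flag = 1 | register (bitsize) | hash remainder]
        remainder_bits = hash_bits - EXPONENT - 1 - REGISTER_BITS
        return (
            composite
            | (1 << (remainder_bits + REGISTER_BITS))  # flag bit
            | (register << remainder_bits)
            | hash_remainder
        )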
