✨ filter selectivity estimation #2268

joocer · 2025-01-18T08:42:58Z

import numpy as np
import math

def estimate_selectivity(
    lower: float, upper: float, total_records: int, cardinality: int, value: float, filter_type: str
) -> float:
    """
    Estimates filter selectivity given column bounds, record count, cardinality, and a filter condition.

    Parameters:
        lower (float): Lower bound of column values.
        upper (float): Upper bound of column values.
        total_records (int): Total number of records.
        cardinality (int): Estimated unique values.
        value (float): Filter value.
        filter_type (str): Type of filter ('=', '>=', '<=', '>', '<').

    Returns:
        float: Estimated selectivity (fraction of records).
    """
    
    # Ensure valid range
    range_width = max((upper - lower), 1e-9)  # Avoid division by zero
    
    # Uniform Distribution PDF and CDF
    uniform_pdf = 1 / range_width  # Even spread
    uniform_cdf = lambda x: max(0, min((x - lower) / range_width, 1))  # CDF approximation
    
    # Exponential Distribution (skewed towards lower bound)
    lambda_exp = 1 / range_width  # Approximate rate
    exp_cdf = lambda x: 1 - math.exp(-lambda_exp * (x - lower)) if x >= lower else 0

    # Normal Distribution (bell curve)
    mean = (upper + lower) / 2
    std_dev = range_width / max(cardinality, 1)  # Approximate spread
    normal_cdf = lambda x: 0.5 * (1 + math.erf((x - mean) / (std_dev * math.sqrt(2))))  # Normal CDF using math.erf

    if filter_type == "=":
        # Approximate P(x = value)
        return min(1, 1 / max(cardinality, 1))  # If cardinality exists, 1/cardinality

    elif filter_type == ">=":
        return 1 - normal_cdf(value)

    elif filter_type == "<=":
        return normal_cdf(value)

    elif filter_type == ">":
        return 1 - normal_cdf(value)

    elif filter_type == "<":
        return normal_cdf(value)

    return 0  # Default fallback

# Example Usage
lower = 10
upper = 100
total_records = 1000000
cardinality = 10000
value = 50

print(f"P(x = {value}):", estimate_selectivity(lower, upper, total_records, cardinality, value, "="))
print(f"P(x >= {value}):", estimate_selectivity(lower, upper, total_records, cardinality, value, ">="))
print(f"P(x <= {value}):", estimate_selectivity(lower, upper, total_records, cardinality, value, "<="))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

✨ filter selectivity estimation #2268

✨ filter selectivity estimation #2268

joocer commented Jan 18, 2025

✨ filter selectivity estimation #2268

✨ filter selectivity estimation #2268

Comments

joocer commented Jan 18, 2025