Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ filter selectivity estimation #2268

Open
joocer opened this issue Jan 18, 2025 · 0 comments
Open

✨ filter selectivity estimation #2268

joocer opened this issue Jan 18, 2025 · 0 comments

Comments

@joocer
Copy link
Contributor

joocer commented Jan 18, 2025

import numpy as np
import math

def estimate_selectivity(
    lower: float, upper: float, total_records: int, cardinality: int, value: float, filter_type: str
) -> float:
    """
    Estimates filter selectivity given column bounds, record count, cardinality, and a filter condition.

    Parameters:
        lower (float): Lower bound of column values.
        upper (float): Upper bound of column values.
        total_records (int): Total number of records.
        cardinality (int): Estimated unique values.
        value (float): Filter value.
        filter_type (str): Type of filter ('=', '>=', '<=', '>', '<').

    Returns:
        float: Estimated selectivity (fraction of records).
    """
    
    # Ensure valid range
    range_width = max((upper - lower), 1e-9)  # Avoid division by zero
    
    # Uniform Distribution PDF and CDF
    uniform_pdf = 1 / range_width  # Even spread
    uniform_cdf = lambda x: max(0, min((x - lower) / range_width, 1))  # CDF approximation
    
    # Exponential Distribution (skewed towards lower bound)
    lambda_exp = 1 / range_width  # Approximate rate
    exp_cdf = lambda x: 1 - math.exp(-lambda_exp * (x - lower)) if x >= lower else 0

    # Normal Distribution (bell curve)
    mean = (upper + lower) / 2
    std_dev = range_width / max(cardinality, 1)  # Approximate spread
    normal_cdf = lambda x: 0.5 * (1 + math.erf((x - mean) / (std_dev * math.sqrt(2))))  # Normal CDF using math.erf

    if filter_type == "=":
        # Approximate P(x = value)
        return min(1, 1 / max(cardinality, 1))  # If cardinality exists, 1/cardinality

    elif filter_type == ">=":
        return 1 - normal_cdf(value)

    elif filter_type == "<=":
        return normal_cdf(value)

    elif filter_type == ">":
        return 1 - normal_cdf(value)

    elif filter_type == "<":
        return normal_cdf(value)

    return 0  # Default fallback

# Example Usage
lower = 10
upper = 100
total_records = 1000000
cardinality = 10000
value = 50

print(f"P(x = {value}):", estimate_selectivity(lower, upper, total_records, cardinality, value, "="))
print(f"P(x >= {value}):", estimate_selectivity(lower, upper, total_records, cardinality, value, ">="))
print(f"P(x <= {value}):", estimate_selectivity(lower, upper, total_records, cardinality, value, "<="))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant