Skip to content

Commit

Permalink
Add filter implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
lanlou1554 committed Nov 19, 2024
1 parent efd01f8 commit 795ca1f
Show file tree
Hide file tree
Showing 9 changed files with 1,796 additions and 0 deletions.
Empty file removed optd-cost-model/src/cost/filter.rs
Empty file.
183 changes: 183 additions & 0 deletions optd-cost-model/src/cost/filter/attribute.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
use std::ops::Bound;

use crate::{
common::{types::TableId, values::Value},
cost_model::CostModelImpl,
stats::{AttributeCombValue, AttributeCombValueStats, DEFAULT_EQ_SEL, DEFAULT_INEQ_SEL},
storage::CostModelStorageManager,
CostModelResult,
};

impl<S: CostModelStorageManager> CostModelImpl<S> {
/// Get the selectivity of an expression of the form "attribute equals value" (or "value equals
/// attribute") Will handle the case of statistics missing
/// Equality predicates are handled entirely differently from range predicates so this is its
/// own function
/// Also, get_attribute_equality_selectivity is a subroutine when computing range
/// selectivity, which is another reason for separating these into two functions
/// is_eq means whether it's == or !=
///
/// Currently, we only support calculating the equality selectivity for an existed attribute,
/// not a derived attribute.
/// TODO: Support derived attributes.
pub(crate) async fn get_attribute_equality_selectivity(
&self,
table_id: TableId,
attr_base_index: u64,
value: &Value,
is_eq: bool,
) -> CostModelResult<f64> {
let ret_sel = {
if let Some(attribute_stats) = self
.get_attribute_comb_stats(table_id, &[attr_base_index])
.await?
{
let eq_freq =
if let Some(freq) = attribute_stats.mcvs.freq(&vec![Some(value.clone())]) {
freq
} else {
let non_mcv_freq = 1.0 - attribute_stats.mcvs.total_freq();
// always safe because usize is at least as large as i32
let ndistinct_as_usize = attribute_stats.ndistinct as usize;
let non_mcv_cnt = ndistinct_as_usize - attribute_stats.mcvs.cnt();
if non_mcv_cnt == 0 {
return Ok(0.0);
}
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt
// - 1 if null_frac > 0
(non_mcv_freq - attribute_stats.null_frac) / (non_mcv_cnt as f64)
};
if is_eq {
eq_freq
} else {
1.0 - eq_freq - attribute_stats.null_frac
}
} else {
#[allow(clippy::collapsible_else_if)]
if is_eq {
DEFAULT_EQ_SEL
} else {
1.0 - DEFAULT_EQ_SEL
}
}
};

assert!(
(0.0..=1.0).contains(&ret_sel),
"ret_sel ({}) should be in [0, 1]",
ret_sel
);
Ok(ret_sel)
}

/// Compute the frequency of values in a attribute less than or equal to the given value.
fn get_attribute_leq_value_freq(
per_attribute_stats: &AttributeCombValueStats,
value: &Value,
) -> f64 {
// because distr does not include the values in MCVs, we need to compute the CDFs there as
// well because nulls return false in any comparison, they are never included when
// computing range selectivity
let distr_leq_freq = per_attribute_stats.distr.as_ref().unwrap().cdf(value);
let value = value.clone();
let pred = Box::new(move |val: &AttributeCombValue| *val[0].as_ref().unwrap() <= value);
let mcvs_leq_freq = per_attribute_stats.mcvs.freq_over_pred(pred);
let ret_freq = distr_leq_freq + mcvs_leq_freq;
assert!(
(0.0..=1.0).contains(&ret_freq),
"ret_freq ({}) should be in [0, 1]",
ret_freq
);
ret_freq
}

/// Compute the frequency of values in a attribute less than the given value.
///
/// Currently, we only support calculating the equality selectivity for an existed attribute,
/// not a derived attribute.
/// TODO: Support derived attributes.
async fn get_attribute_lt_value_freq(
&self,
attribute_stats: &AttributeCombValueStats,
table_id: TableId,
attr_base_index: u64,
value: &Value,
) -> CostModelResult<f64> {
// depending on whether value is in mcvs or not, we use different logic to turn total_lt_cdf
// into total_leq_cdf this logic just so happens to be the exact same logic as
// get_attribute_equality_selectivity implements
let ret_freq = Self::get_attribute_leq_value_freq(attribute_stats, value)
- self
.get_attribute_equality_selectivity(table_id, attr_base_index, value, true)
.await?;
assert!(
(0.0..=1.0).contains(&ret_freq),
"ret_freq ({}) should be in [0, 1]",
ret_freq
);
Ok(ret_freq)
}

/// Get the selectivity of an expression of the form "attribute </<=/>=/> value" (or "value
/// </<=/>=/> attribute"). Computes selectivity based off of statistics.
/// Range predicates are handled entirely differently from equality predicates so this is its
/// own function. If it is unable to find the statistics, it returns DEFAULT_INEQ_SEL.
/// The selectivity is computed as quantile of the right bound minus quantile of the left bound.
///
/// Currently, we only support calculating the equality selectivity for an existed attribute,
/// not a derived attribute.
/// TODO: Support derived attributes.
pub(crate) async fn get_attribute_range_selectivity(
&self,
table_id: TableId,
attr_base_index: u64,
start: Bound<&Value>,
end: Bound<&Value>,
) -> CostModelResult<f64> {
// TODO: Consider attribute is a derived attribute
if let Some(attribute_stats) = self
.get_attribute_comb_stats(table_id, &[attr_base_index])
.await?
{
let left_quantile = match start {
Bound::Unbounded => 0.0,
Bound::Included(value) => {
self.get_attribute_lt_value_freq(
&attribute_stats,
table_id,
attr_base_index,
value,
)
.await?
}
Bound::Excluded(value) => {
Self::get_attribute_leq_value_freq(&attribute_stats, value)
}
};
let right_quantile = match end {
Bound::Unbounded => 1.0,
Bound::Included(value) => {
Self::get_attribute_leq_value_freq(&attribute_stats, value)
}
Bound::Excluded(value) => {
self.get_attribute_lt_value_freq(
&attribute_stats,
table_id,
attr_base_index,
value,
)
.await?
}
};
assert!(
left_quantile <= right_quantile,
"left_quantile ({}) should be <= right_quantile ({})",
left_quantile,
right_quantile
);
Ok(right_quantile - left_quantile)
} else {
Ok(DEFAULT_INEQ_SEL)
}
}
}
Loading

0 comments on commit 795ca1f

Please sign in to comment.