Skip to content

Commit

Permalink
feat(cost-model): Basic cost model computation (#40)
Browse files Browse the repository at this point in the history
* add basic cost computation

---------

Co-authored-by: Yuanxin Cao <[email protected]>
  • Loading branch information
lanlou1554 and xx01cyx authored Nov 19, 2024
1 parent db9dbbe commit efd01f8
Show file tree
Hide file tree
Showing 22 changed files with 2,431 additions and 102 deletions.
708 changes: 705 additions & 3 deletions Cargo.lock

Large diffs are not rendered by default.

675 changes: 656 additions & 19 deletions optd-cost-model/Cargo.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions optd-cost-model/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
name = "optd-cost-model"
version = "0.1.0"
edition = "2021"
authors = ["Yuanxin Cao", "Lan Lou", "Kunle Li"]

[dependencies]
optd-persistent = { path = "../optd-persistent", version = "0.1" }
Expand All @@ -10,10 +11,15 @@ serde_json = "1.0"
serde_with = { version = "3.7.0", features = ["json"] }
arrow-schema = "53.2.0"
datafusion-expr = "32.0.0"
datafusion = "32.0.0"
ordered-float = "4.0"
chrono = "0.4"
itertools = "0.13"
assert_approx_eq = "1.1.0"
trait-variant = "0.1.2"
tokio = { version = "1.0.1", features = ["macros", "rt-multi-thread"] }

[dev-dependencies]
crossbeam = "0.8"
rand = "0.8"
test-case = "3.3"
202 changes: 202 additions & 0 deletions optd-cost-model/src/cost/agg.rs
Original file line number Diff line number Diff line change
@@ -1 +1,203 @@
use crate::{
common::{
nodes::{ArcPredicateNode, PredicateType, ReprPredicateNode},
predicates::{attr_index_pred::AttrIndexPred, list_pred::ListPred},
properties::attr_ref::{AttrRef, BaseTableAttrRef},
types::GroupId,
},
cost_model::CostModelImpl,
stats::DEFAULT_NUM_DISTINCT,
storage::CostModelStorageManager,
CostModelError, CostModelResult, EstimatedStatistic, SemanticError,
};

impl<S: CostModelStorageManager> CostModelImpl<S> {
pub async fn get_agg_row_cnt(
&self,
group_id: GroupId,
group_by: ArcPredicateNode,
) -> CostModelResult<EstimatedStatistic> {
let group_by = ListPred::from_pred_node(group_by).unwrap();
if group_by.is_empty() {
Ok(EstimatedStatistic(1.0))
} else {
// Multiply the n-distinct of all the group by columns.
// TODO: improve with multi-dimensional n-distinct
let mut row_cnt = 1;

for node in &group_by.0.children {
match node.typ {
PredicateType::AttrIndex => {
let attr_ref =
AttrIndexPred::from_pred_node(node.clone()).ok_or_else(|| {
SemanticError::InvalidPredicate(
"Expected AttributeRef predicate".to_string(),
)
})?;
if let AttrRef::BaseTableAttrRef(BaseTableAttrRef { table_id, attr_idx }) =
self.memo.get_attribute_ref(group_id, attr_ref.attr_index())
{
// TODO: Only query ndistinct instead of all kinds of stats.
let stats_option =
self.get_attribute_comb_stats(table_id, &[attr_idx]).await?;

let ndistinct = match stats_option {
Some(stats) => stats.ndistinct,
None => {
// The column type is not supported or stats are missing.
DEFAULT_NUM_DISTINCT
}
};
row_cnt *= ndistinct;
} else {
// TOOD: Handle derived attributes.
row_cnt *= DEFAULT_NUM_DISTINCT;
}
}
_ => {
// TODO: Consider the case where `GROUP BY 1`.
panic!("GROUP BY must have attribute ref predicate");
}
}
}
Ok(EstimatedStatistic(row_cnt as f64))
}
}
}

#[cfg(test)]
mod tests {
use std::{collections::HashMap, ops::Deref};

use crate::{
common::{
predicates::constant_pred::ConstantType,
properties::Attribute,
types::{GroupId, TableId},
values::Value,
},
cost_model::tests::{
attr_index, cnst, create_mock_cost_model, create_mock_cost_model_with_attr_types,
empty_list, empty_per_attr_stats, list, TestPerAttributeStats, TEST_ATTR1_BASE_INDEX,
TEST_ATTR2_BASE_INDEX, TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID,
},
stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT},
EstimatedStatistic,
};

#[tokio::test]
async fn test_agg_no_stats() {
let cost_model = create_mock_cost_model_with_attr_types(
vec![TEST_TABLE1_ID],
vec![],
vec![HashMap::from([
(TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
(TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
])],
vec![None],
);

// Group by empty list should return 1.
let group_bys = empty_list();
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic(1.0)
);

// Group by single column should return the default value since there are no stats.
let group_bys = list(vec![attr_index(0)]);
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic(DEFAULT_NUM_DISTINCT as f64)
);

// Group by two columns should return the default value squared since there are no stats.
let group_bys = list(vec![attr_index(0), attr_index(1)]);
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic((DEFAULT_NUM_DISTINCT * DEFAULT_NUM_DISTINCT) as f64)
);
}

#[tokio::test]
async fn test_agg_with_stats() {
let attr1_ndistinct = 12;
let attr2_ndistinct = 645;
let attr1_stats = TestPerAttributeStats::new(
MostCommonValues::SimpleFrequency(SimpleMap::default()),
None,
attr1_ndistinct,
0.0,
);
let attr2_stats = TestPerAttributeStats::new(
MostCommonValues::SimpleFrequency(SimpleMap::default()),
None,
attr2_ndistinct,
0.0,
);

let cost_model = create_mock_cost_model_with_attr_types(
vec![TEST_TABLE1_ID],
vec![HashMap::from([
(TEST_ATTR1_BASE_INDEX, attr1_stats),
(TEST_ATTR2_BASE_INDEX, attr2_stats),
])],
vec![HashMap::from([
(TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
(TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
(TEST_ATTR3_BASE_INDEX, ConstantType::Int32),
])],
vec![None],
);

// Group by empty list should return 1.
let group_bys = empty_list();
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic(1.0)
);

// Group by single column should return the n-distinct of the column.
let group_bys = list(vec![attr_index(0)]);
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic(attr1_ndistinct as f64)
);

// Group by two columns should return the product of the n-distinct of the columns.
let group_bys = list(vec![attr_index(0), attr_index(1)]);
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic((attr1_ndistinct * attr2_ndistinct) as f64)
);

// Group by multiple columns should return the product of the n-distinct of the columns. If one of the columns
// does not have stats, it should use the default value instead.
let group_bys = list(vec![attr_index(0), attr_index(1), attr_index(2)]);
assert_eq!(
cost_model
.get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
.await
.unwrap(),
EstimatedStatistic((attr1_ndistinct * attr2_ndistinct * DEFAULT_NUM_DISTINCT) as f64)
);
}
}
1 change: 0 additions & 1 deletion optd-cost-model/src/cost/filter.rs
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@

1 change: 0 additions & 1 deletion optd-cost-model/src/cost/join.rs
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@

28 changes: 28 additions & 0 deletions optd-cost-model/src/cost/limit.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use crate::{
common::{
nodes::{ArcPredicateNode, ReprPredicateNode},
predicates::constant_pred::ConstantPred,
},
cost_model::CostModelImpl,
storage::CostModelStorageManager,
CostModelResult, EstimatedStatistic,
};

impl<S: CostModelStorageManager> CostModelImpl<S> {
pub(crate) fn get_limit_row_cnt(
&self,
child_row_cnt: EstimatedStatistic,
fetch_expr: ArcPredicateNode,
) -> CostModelResult<EstimatedStatistic> {
let fetch = ConstantPred::from_pred_node(fetch_expr)
.unwrap()
.value()
.as_u64();
// u64::MAX represents None
if fetch == u64::MAX {
Ok(child_row_cnt)
} else {
Ok(EstimatedStatistic(child_row_cnt.0.min(fetch as f64)))
}
}
}
3 changes: 3 additions & 0 deletions optd-cost-model/src/cost/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#![allow(unused)]

pub mod agg;
pub mod filter;
pub mod join;
pub mod limit;
Loading

0 comments on commit efd01f8

Please sign in to comment.