feat(cost-model): Basic cost model computation (#40)

* add basic cost computation --------- Co-authored-by: Yuanxin Cao <[email protected]>
cmu-db · Nov 19, 2024 · efd01f8 · efd01f8
1 parent db9dbbe
commit efd01f8
Show file tree

Hide file tree

Showing 22 changed files with 2,431 additions and 102 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/optd-cost-model/Cargo.lock b/optd-cost-model/Cargo.lock
diff --git a/optd-cost-model/Cargo.toml b/optd-cost-model/Cargo.toml
@@ -2,6 +2,7 @@
 name = "optd-cost-model"
 version = "0.1.0"
 edition = "2021"
+authors = ["Yuanxin Cao", "Lan Lou", "Kunle Li"]
 
 [dependencies]
 optd-persistent = { path = "../optd-persistent", version = "0.1" }
@@ -10,10 +11,15 @@ serde_json = "1.0"
 serde_with = { version = "3.7.0", features = ["json"] }
 arrow-schema = "53.2.0"
 datafusion-expr = "32.0.0"
+datafusion = "32.0.0"
 ordered-float = "4.0"
 chrono = "0.4"
 itertools = "0.13"
+assert_approx_eq = "1.1.0"
+trait-variant = "0.1.2"
+tokio = { version = "1.0.1", features = ["macros", "rt-multi-thread"] }
 
 [dev-dependencies]
 crossbeam = "0.8"
 rand = "0.8"
+test-case = "3.3"
diff --git a/optd-cost-model/src/cost/agg.rs b/optd-cost-model/src/cost/agg.rs
@@ -1 +1,203 @@
+use crate::{
+    common::{
+        nodes::{ArcPredicateNode, PredicateType, ReprPredicateNode},
+        predicates::{attr_index_pred::AttrIndexPred, list_pred::ListPred},
+        properties::attr_ref::{AttrRef, BaseTableAttrRef},
+        types::GroupId,
+    },
+    cost_model::CostModelImpl,
+    stats::DEFAULT_NUM_DISTINCT,
+    storage::CostModelStorageManager,
+    CostModelError, CostModelResult, EstimatedStatistic, SemanticError,
+};
 
+impl<S: CostModelStorageManager> CostModelImpl<S> {
+    /// Estimates the number of rows produced by an aggregation.
+    ///
+    /// An empty `group_by` list yields exactly one row. Otherwise the estimate is the
+    /// product of the n-distinct values of all group-by attributes. An attribute that is
+    /// not a base-table attribute, or for which no statistics are returned, contributes
+    /// [`DEFAULT_NUM_DISTINCT`].
+    ///
+    /// # Errors
+    ///
+    /// Returns [`SemanticError::InvalidPredicate`] when `group_by` is not a list predicate
+    /// or when one of its children is not an attribute-index predicate, and propagates any
+    /// error raised by the statistics lookup.
+    pub async fn get_agg_row_cnt(
+        &self,
+        group_id: GroupId,
+        group_by: ArcPredicateNode,
+    ) -> CostModelResult<EstimatedStatistic> {
+        let group_by = ListPred::from_pred_node(group_by).ok_or_else(|| {
+            SemanticError::InvalidPredicate("Expected list predicate for GROUP BY".to_string())
+        })?;
+        if group_by.is_empty() {
+            return Ok(EstimatedStatistic(1.0));
+        }
+
+        // Multiply the n-distinct of all the group by columns.
+        // TODO: improve with multi-dimensional n-distinct
+        //
+        // The product is accumulated as `f64` (the type of the final estimate) so that a
+        // GROUP BY over many high-cardinality columns cannot overflow an integer.
+        let mut row_cnt = 1.0_f64;
+
+        for node in &group_by.0.children {
+            match node.typ {
+                PredicateType::AttrIndex => {
+                    let attr_ref = AttrIndexPred::from_pred_node(node.clone()).ok_or_else(|| {
+                        SemanticError::InvalidPredicate(
+                            "Expected AttributeRef predicate".to_string(),
+                        )
+                    })?;
+                    let ndistinct = if let AttrRef::BaseTableAttrRef(BaseTableAttrRef {
+                        table_id,
+                        attr_idx,
+                    }) = self.memo.get_attribute_ref(group_id, attr_ref.attr_index())
+                    {
+                        // TODO: Only query ndistinct instead of all kinds of stats.
+                        let stats_option =
+                            self.get_attribute_comb_stats(table_id, &[attr_idx]).await?;
+
+                        match stats_option {
+                            Some(stats) => stats.ndistinct,
+                            // The column type is not supported or stats are missing.
+                            None => DEFAULT_NUM_DISTINCT,
+                        }
+                    } else {
+                        // TODO: Handle derived attributes.
+                        DEFAULT_NUM_DISTINCT
+                    };
+                    row_cnt *= ndistinct as f64;
+                }
+                _ => {
+                    // TODO: Consider the case where `GROUP BY 1`.
+                    return Err(SemanticError::InvalidPredicate(
+                        "GROUP BY must have attribute ref predicate".to_string(),
+                    )
+                    .into());
+                }
+            }
+        }
+        Ok(EstimatedStatistic(row_cnt))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::HashMap, ops::Deref};
+
+    use crate::{
+        common::{
+            predicates::constant_pred::ConstantType,
+            properties::Attribute,
+            types::{GroupId, TableId},
+            values::Value,
+        },
+        cost_model::tests::{
+            attr_index, cnst, create_mock_cost_model, create_mock_cost_model_with_attr_types,
+            empty_list, empty_per_attr_stats, list, TestPerAttributeStats, TEST_ATTR1_BASE_INDEX,
+            TEST_ATTR2_BASE_INDEX, TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID,
+        },
+        stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT},
+        EstimatedStatistic,
+    };
+
+    /// With no statistics registered, every group-by attribute contributes
+    /// `DEFAULT_NUM_DISTINCT` to the estimate.
+    #[tokio::test]
+    async fn test_agg_no_stats() {
+        let attr_types = HashMap::from([
+            (TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
+            (TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
+        ]);
+        let cost_model = create_mock_cost_model_with_attr_types(
+            vec![TEST_TABLE1_ID],
+            vec![],
+            vec![attr_types],
+            vec![None],
+        );
+
+        // Each case pairs a group-by list with the row count it is expected to produce.
+        let cases = [
+            // Group by empty list should return 1.
+            (empty_list(), 1.0),
+            // A single column falls back to the default value since there are no stats.
+            (list(vec![attr_index(0)]), DEFAULT_NUM_DISTINCT as f64),
+            // Two columns give the default value squared since there are no stats.
+            (
+                list(vec![attr_index(0), attr_index(1)]),
+                (DEFAULT_NUM_DISTINCT * DEFAULT_NUM_DISTINCT) as f64,
+            ),
+        ];
+
+        for (group_bys, expected) in cases {
+            let actual = cost_model
+                .get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
+                .await
+                .unwrap();
+            assert_eq!(actual, EstimatedStatistic(expected));
+        }
+    }
+
+    /// With statistics for some attributes, the estimate is the product of their
+    /// n-distinct values, using the default for attributes that lack statistics.
+    #[tokio::test]
+    async fn test_agg_with_stats() {
+        let attr1_ndistinct = 12;
+        let attr2_ndistinct = 645;
+
+        // Both attributes use empty most-common-values and differ only in n-distinct.
+        let stats_with_ndistinct = |ndistinct| {
+            TestPerAttributeStats::new(
+                MostCommonValues::SimpleFrequency(SimpleMap::default()),
+                None,
+                ndistinct,
+                0.0,
+            )
+        };
+        let per_attr_stats = HashMap::from([
+            (TEST_ATTR1_BASE_INDEX, stats_with_ndistinct(attr1_ndistinct)),
+            (TEST_ATTR2_BASE_INDEX, stats_with_ndistinct(attr2_ndistinct)),
+        ]);
+        let attr_types = HashMap::from([
+            (TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
+            (TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
+            (TEST_ATTR3_BASE_INDEX, ConstantType::Int32),
+        ]);
+
+        let cost_model = create_mock_cost_model_with_attr_types(
+            vec![TEST_TABLE1_ID],
+            vec![per_attr_stats],
+            vec![attr_types],
+            vec![None],
+        );
+
+        let cases = [
+            // Group by empty list should return 1.
+            (empty_list(), 1.0),
+            // A single column returns the n-distinct of that column.
+            (list(vec![attr_index(0)]), attr1_ndistinct as f64),
+            // Two columns return the product of their n-distinct values.
+            (
+                list(vec![attr_index(0), attr_index(1)]),
+                (attr1_ndistinct * attr2_ndistinct) as f64,
+            ),
+            // A column without stats contributes the default value to the product.
+            (
+                list(vec![attr_index(0), attr_index(1), attr_index(2)]),
+                (attr1_ndistinct * attr2_ndistinct * DEFAULT_NUM_DISTINCT) as f64,
+            ),
+        ];
+
+        for (group_bys, expected) in cases {
+            let actual = cost_model
+                .get_agg_row_cnt(TEST_GROUP1_ID, group_bys)
+                .await
+                .unwrap();
+            assert_eq!(actual, EstimatedStatistic(expected));
+        }
+    }
+}
diff --git a/optd-cost-model/src/cost/filter.rs b/optd-cost-model/src/cost/filter.rs
@@ -1 +0,0 @@
-

diff --git a/optd-cost-model/src/cost/join.rs b/optd-cost-model/src/cost/join.rs
@@ -1 +0,0 @@
-

diff --git a/optd-cost-model/src/cost/limit.rs b/optd-cost-model/src/cost/limit.rs
@@ -0,0 +1,28 @@
+use crate::{
+    common::{
+        nodes::{ArcPredicateNode, ReprPredicateNode},
+        predicates::constant_pred::ConstantPred,
+    },
+    cost_model::CostModelImpl,
+    storage::CostModelStorageManager,
+    CostModelResult, EstimatedStatistic,
+};
+
+impl<S: CostModelStorageManager> CostModelImpl<S> {
+    /// Estimates the number of rows produced by a limit operator.
+    ///
+    /// `fetch_expr` is read as a constant predicate and interpreted as a `u64`. The value
+    /// `u64::MAX` stands for "no limit", in which case `child_row_cnt` is returned
+    /// unchanged; otherwise the result is the smaller of the child row count and the
+    /// fetch value.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `fetch_expr` cannot be converted into a `ConstantPred`.
+    pub(crate) fn get_limit_row_cnt(
+        &self,
+        child_row_cnt: EstimatedStatistic,
+        fetch_expr: ArcPredicateNode,
+    ) -> CostModelResult<EstimatedStatistic> {
+        let fetch_pred = ConstantPred::from_pred_node(fetch_expr).unwrap();
+        let fetch = fetch_pred.value().as_u64();
+
+        // u64::MAX represents None
+        if fetch == u64::MAX {
+            return Ok(child_row_cnt);
+        }
+
+        let capped = child_row_cnt.0.min(fetch as f64);
+        Ok(EstimatedStatistic(capped))
+    }
+}
diff --git a/optd-cost-model/src/cost/mod.rs b/optd-cost-model/src/cost/mod.rs
@@ -1,3 +1,6 @@
+// NOTE(review): this suppresses every `unused` lint for the `cost` module and its
+// submodules. Presumably a temporary measure while the cost model is being built out —
+// confirm, and narrow or remove it once the submodules are wired in.
+#![allow(unused)]
+
 pub mod agg;
 pub mod filter;
 pub mod join;
+pub mod limit;