perf: Fix join cost estimates (#3831)

This PR fixes join cost estimates in the following ways:
- Uses the number of rows instead of the size in bytes when making probe-side decisions.
- Decreases the aggressiveness of cardinality reductions when there is an `is not NULL` filter.
- Decreases the aggressiveness of inequality filters, especially since these often appear in pairs forming a "between" clause.
- Increases the aggressiveness of exact equality filters.
- For join edges with more than one join condition, adjusts cost estimation so that the total domain is computed as `|(key1, key2, ...)| ~= min(|left side|, |right side|)`. With multiple join conditions, we know that the join is not a pk-fk join on each individual join key. Rather, it's a pk-fk join on the tuple of join keys, whose domain we estimate as the cardinality of the smaller table in the join.
Eventual-Inc · Feb 28, 2025 · 2351ba4 · 2351ba4
1 parent b830647
commit 2351ba4
Show file tree

Hide file tree

Showing 4 changed files with 234 additions and 67 deletions.
diff --git a/src/daft-dsl/src/expr/mod.rs b/src/daft-dsl/src/expr/mod.rs
@@ -1586,10 +1586,10 @@ pub fn estimated_selectivity(expr: &Expr, schema: &Schema) -> f64 {
             let right_selectivity = estimated_selectivity(right, schema);
             match op {
                 // Fixed selectivity for all common comparisons
-                Operator::Eq => 0.1,
-                Operator::EqNullSafe => 0.1,
-                Operator::NotEq => 0.9,
-                Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => 0.2,
+                Operator::Eq => 0.05,
+                Operator::EqNullSafe => 0.05,
+                Operator::NotEq => 0.95,
+                Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => 0.5,
 
                 // Logical operators with fixed estimates
                 // P(A and B) = P(A) * P(B)
@@ -1619,8 +1619,8 @@ pub fn estimated_selectivity(expr: &Expr, schema: &Schema) -> f64 {
         Expr::Not(expr) => 1.0 - estimated_selectivity(expr, schema),
 
         // Fixed selectivity for IS NULL and IS NOT NULL, assume not many nulls
-        Expr::IsNull(_) => 0.1,
-        Expr::NotNull(_) => 0.9,
+        Expr::IsNull(_) => 0.05,
+        Expr::NotNull(_) => 0.95,
 
         // All membership operations use same selectivity
         Expr::IsIn(_, _) | Expr::Between(_, _, _) | Expr::InSubquery(_, _) | Expr::Exists(_) => 0.2,

diff --git a/src/daft-local-execution/src/pipeline.rs b/src/daft-local-execution/src/pipeline.rs
@@ -344,8 +344,8 @@ pub fn physical_plan_to_pipeline(
                         StatsState::Materialized(left_stats),
                         StatsState::Materialized(right_stats),
                     ) => {
-                        let left_size = left_stats.approx_stats.size_bytes;
-                        let right_size = right_stats.approx_stats.size_bytes;
+                        let left_size = left_stats.approx_stats.num_rows;
+                        let right_size = right_stats.approx_stats.num_rows;
                         left_size <= right_size
                     }
                     // If stats are only available on the right side of the join, and the upper bound bytes on the
@@ -363,8 +363,8 @@ pub fn physical_plan_to_pipeline(
                         StatsState::Materialized(left_stats),
                         StatsState::Materialized(right_stats),
                     ) => {
-                        let left_size = left_stats.approx_stats.size_bytes;
-                        let right_size = right_stats.approx_stats.size_bytes;
+                        let left_size = left_stats.approx_stats.num_rows;
+                        let right_size = right_stats.approx_stats.num_rows;
                         right_size as f64 >= left_size as f64 * 1.5
                     }
                     // If stats are only available on the left side of the join, and the upper bound bytes on the left
@@ -382,8 +382,8 @@ pub fn physical_plan_to_pipeline(
                         StatsState::Materialized(left_stats),
                         StatsState::Materialized(right_stats),
                     ) => {
-                        let left_size = left_stats.approx_stats.size_bytes;
-                        let right_size = right_stats.approx_stats.size_bytes;
+                        let left_size = left_stats.approx_stats.num_rows;
+                        let right_size = right_stats.approx_stats.num_rows;
                         (right_size as f64 * 1.5) >= left_size as f64
                     }
                     // If stats are only available on the right side of the join, and the upper bound bytes on the
@@ -401,8 +401,8 @@ pub fn physical_plan_to_pipeline(
                         StatsState::Materialized(left_stats),
                         StatsState::Materialized(right_stats),
                     ) => {
-                        let left_size = left_stats.approx_stats.size_bytes;
-                        let right_size = right_stats.approx_stats.size_bytes;
+                        let left_size = left_stats.approx_stats.num_rows;
+                        let right_size = right_stats.approx_stats.num_rows;
                         right_size as f64 > left_size as f64 * 1.5
                     }
                     // If stats are only available on the left side of the join, and the upper bound bytes on the left

diff --git a/src/daft-logical-plan/src/optimization/rules/reorder_joins/brute_force_join_order.rs b/src/daft-logical-plan/src/optimization/rules/reorder_joins/brute_force_join_order.rs
@@ -241,7 +241,7 @@ mod tests {
             let order = $orderer.order(&graph);
             assert!(JoinOrderTree::order_eq(&order, &$optimal_order));
             // Check that the number of join conditions does not increase due to join edge inference.
-            assert_eq!(JoinOrderTree::num_join_conditions(&order), num_edges);
+            assert!(JoinOrderTree::num_join_conditions(&order) <= num_edges);
         };
     }
 
@@ -290,45 +290,6 @@ mod tests {
         create_and_test_join_order!(nodes, edges, BruteForceJoinOrderer {}, optimal_order);
     }
 
-    #[test]
-    fn test_brute_force_order_minimal2() {
-        // Compared to the previous test, this test has a smaller "large" relation. When joined with "medium" using two join conditions,
-        // the result produces a smaller relation than "small". Hence the join order should be ((large x medium) x small).
-        let nodes = vec![("medium", 1_000), ("large", 5_000), ("small", 500)];
-        let name_to_id = node_to_id_map(nodes.clone());
-        let edges = vec![
-            JoinEdge {
-                node1: name_to_id["medium"],
-                node1_col_name: "m_medium".to_string(),
-                node2: name_to_id["large"],
-                node2_col_name: "l_medium".to_string(),
-                total_domain: 1_000,
-            },
-            JoinEdge {
-                node1: name_to_id["large"],
-                node1_col_name: "l_small".to_string(),
-                node2: name_to_id["small"],
-                node2_col_name: "s_small".to_string(),
-                total_domain: 500,
-            },
-            JoinEdge {
-                node1: name_to_id["medium"],
-                node1_col_name: "m_small".to_string(),
-                node2: name_to_id["small"],
-                node2_col_name: "s_small".to_string(),
-                total_domain: 500,
-            },
-        ];
-        let optimal_order = test_join(
-            test_relation(name_to_id["small"]),
-            test_join(
-                test_relation(name_to_id["large"]),
-                test_relation(name_to_id["medium"]),
-            ),
-        );
-        create_and_test_join_order!(nodes, edges, BruteForceJoinOrderer {}, optimal_order);
-    }
-
     #[test]
     fn test_brute_force_order_mock_tpch_q5() {
         let nodes = vec![
@@ -403,6 +364,138 @@ mod tests {
         create_and_test_join_order!(nodes, edges, BruteForceJoinOrderer {}, optimal_order);
     }
 
+    #[test]
+    fn test_brute_force_order_mock_tpch_sub_q9() {
+        let nodes = vec![
+            ("nation", 25),
+            ("supplier", 100_000),
+            ("part", 100_000),
+            ("partsupp", 8_000_000),
+        ];
+        let name_to_id = node_to_id_map(nodes.clone());
+        let edges = vec![
+            JoinEdge {
+                node1: name_to_id["partsupp"],
+                node1_col_name: "ps_partkey".to_string(),
+                node2: name_to_id["part"],
+                node2_col_name: "p_partkey".to_string(),
+                total_domain: 2_000_000,
+            },
+            JoinEdge {
+                node1: name_to_id["partsupp"],
+                node1_col_name: "ps_suppkey".to_string(),
+                node2: name_to_id["supplier"],
+                node2_col_name: "s_suppkey".to_string(),
+                total_domain: 100_000,
+            },
+            JoinEdge {
+                node1: name_to_id["supplier"],
+                node1_col_name: "s_nationkey".to_string(),
+                node2: name_to_id["nation"],
+                node2_col_name: "n_nationkey".to_string(),
+                total_domain: 25,
+            },
+        ];
+        let optimal_order = test_join(
+            test_join(
+                test_relation(name_to_id["nation"]),
+                test_relation(name_to_id["supplier"]),
+            ),
+            test_join(
+                test_relation(name_to_id["part"]),
+                test_relation(name_to_id["partsupp"]),
+            ),
+        );
+        create_and_test_join_order!(nodes, edges, BruteForceJoinOrderer {}, optimal_order);
+    }
+
+    #[test]
+    fn test_brute_force_order_mock_tpch_q9() {
+        let nodes = vec![
+            ("nation", 22),
+            ("orders", 1_350_000),
+            ("lineitem", 4_374_885),
+            ("supplier", 8_100),
+            ("part", 18_000),
+            ("partsupp", 648_000),
+        ];
+        let name_to_id = node_to_id_map(nodes.clone());
+        let edges = vec![
+            JoinEdge {
+                node1: name_to_id["partsupp"],
+                node1_col_name: "ps_partkey".to_string(),
+                node2: name_to_id["part"],
+                node2_col_name: "p_partkey".to_string(),
+                total_domain: 200_000,
+            },
+            JoinEdge {
+                node1: name_to_id["partsupp"],
+                node1_col_name: "ps_partkey".to_string(),
+                node2: name_to_id["lineitem"],
+                node2_col_name: "l_partkey".to_string(),
+                total_domain: 200_000,
+            },
+            JoinEdge {
+                node1: name_to_id["partsupp"],
+                node1_col_name: "ps_suppkey".to_string(),
+                node2: name_to_id["lineitem"],
+                node2_col_name: "l_suppkey".to_string(),
+                total_domain: 10_000,
+            },
+            JoinEdge {
+                node1: name_to_id["partsupp"],
+                node1_col_name: "ps_suppkey".to_string(),
+                node2: name_to_id["supplier"],
+                node2_col_name: "s_suppkey".to_string(),
+                total_domain: 10_000,
+            },
+            JoinEdge {
+                node1: name_to_id["orders"],
+                node1_col_name: "o_orderkey".to_string(),
+                node2: name_to_id["lineitem"],
+                node2_col_name: "l_orderkey".to_string(),
+                total_domain: 1_500_000,
+            },
+            JoinEdge {
+                node1: name_to_id["lineitem"],
+                node1_col_name: "l_partkey".to_string(),
+                node2: name_to_id["part"],
+                node2_col_name: "p_partkey".to_string(),
+                total_domain: 200_000,
+            },
+            JoinEdge {
+                node1: name_to_id["lineitem"],
+                node1_col_name: "l_suppkey".to_string(),
+                node2: name_to_id["supplier"],
+                node2_col_name: "s_suppkey".to_string(),
+                total_domain: 10_000,
+            },
+            JoinEdge {
+                node1: name_to_id["supplier"],
+                node1_col_name: "s_nationkey".to_string(),
+                node2: name_to_id["nation"],
+                node2_col_name: "n_nationkey".to_string(),
+                total_domain: 25,
+            },
+        ];
+        let optimal_order = test_join(
+            test_relation(name_to_id["orders"]),
+            test_join(
+                test_relation(name_to_id["lineitem"]),
+                test_join(
+                    test_join(
+                        test_relation(name_to_id["nation"]),
+                        test_relation(name_to_id["supplier"]),
+                    ),
+                    test_join(
+                        test_relation(name_to_id["part"]),
+                        test_relation(name_to_id["partsupp"]),
+                    ),
+                ),
+            ),
+        );
+        create_and_test_join_order!(nodes, edges, BruteForceJoinOrderer {}, optimal_order);
+    }
     #[test]
     fn test_brute_force_order_star_schema() {
         let nodes = vec![

diff --git a/src/daft-logical-plan/src/optimization/rules/reorder_joins/join_graph.rs b/src/daft-logical-plan/src/optimization/rules/reorder_joins/join_graph.rs
@@ -251,20 +251,27 @@ impl JoinAdjList {
         }
     }
 
+    // Helper function that estimates the total domain for a join between two relations.
+    fn get_estimated_total_domain(
+        &self,
+        left_plan: &LogicalPlanRef,
+        right_plan: &LogicalPlanRef,
+    ) -> usize {
+        let left_stats = left_plan.materialized_stats();
+        let right_stats = right_plan.materialized_stats();
+        // We multiply the number of rows by the reciprocal of the selectivity to get the original total domain.
+        let left_rows = left_stats.approx_stats.num_rows as f64
+            / left_stats.approx_stats.acc_selectivity.max(0.01);
+        let right_rows = right_stats.approx_stats.num_rows as f64
+            / right_stats.approx_stats.acc_selectivity.max(0.01);
+        left_rows.min(right_rows).max(1.0) as usize
+    }
+
     pub(super) fn add_bidirectional_edge(&mut self, node1: JoinNode, node2: JoinNode) {
         let node1_id = self.get_or_create_plan_id(&node1.plan);
         let node2_id = self.get_or_create_plan_id(&node2.plan);
         // Find the minimal total domain for the join columns, either from the current nodes or from the existing total domains.
-        let mut td = {
-            let node1_stats = node1.plan.materialized_stats();
-            let node2_stats = node2.plan.materialized_stats();
-            // We multiple the number of rows by the reciprocal of the selectivity to get the original total domain.
-            let node1_rows = node1_stats.approx_stats.num_rows as f64
-                / node1_stats.approx_stats.acc_selectivity.max(0.01);
-            let node2_rows = node2_stats.approx_stats.num_rows as f64
-                / node2_stats.approx_stats.acc_selectivity.max(0.01);
-            node1_rows.min(node2_rows).max(1.0) as usize
-        };
+        let mut td = self.get_estimated_total_domain(&node1.plan, &node2.plan);
         if let Some(equivalence_set_id) = self
             .equivalence_set_map
             .get(&(node1_id, node1.relation_name.clone()))
@@ -365,26 +372,93 @@ impl JoinAdjList {
         // Grab the minimum spanning tree of join conditions that connect the left and right trees, i.e. we take at most one join condition
         // from each equivalence set of join conditions.
         let mut conds = vec![];
-        let mut seen_equivalence_set_ids = HashSet::new();
+        let mut added_equivalence_set_id_for_td = HashSet::new();
+        let mut added_equivalence_set_id_for_conds = HashSet::new();
+        let mut double_counted_equivalence_set_ids = HashSet::new();
         let mut td = 1;
         for left_node in left.iter() {
             if let Some(neighbors) = self.edges.get(&left_node) {
                 for right_node in right.iter() {
                     if let Some(edges) = neighbors.get(&right_node) {
-                        for edge in edges {
+                        // When there is only one join condition, we multiply the total domain by the domain of the equivalence set.
+                        // However, when there's more than one join condition between two nodes, then we know that this is not a pk-fk join
+                        // on the join keys. Rather, it's a pk-fk join on the tuple of join keys. So we estimate its total domain as the
+                        // cardinality of the smaller table. In this case as well, we should avoid multiplying the total domain by the
+                        // domains of the equivalence sets. So we use `double_counted_equivalence_set_ids` to keep track of the
+                        // equivalence sets that we should not multiply the total domain by.
+                        //
+                        // For a more concrete example, consider the following join:
+                        //
+                        // part.x = partsupp.x
+                        //
+                        // Assuming |part| < |partsupp|, then the total domain of the join is |part|.
+                        //
+                        // Now consider the following joins:
+                        //
+                        // part.x = partsupp.x
+                        // supp.y = partsupp.y
+                        // lineitem.x = partsupp.x
+                        // lineitem.y = partsupp.y
+                        //
+                        // Note that there are implicit join edges part.x = lineitem.x and supp.y = lineitem.y that we infer.
+                        //
+                        // Assume |supp| < |part| < |partsupp| < |lineitem|.
+                        //
+                        // When joining part and partsupp, we know that the join is a pk-fk join on part.x,
+                        // so the selectivity of the join is 1/|part|.
+                        //
+                        // When joining partsupp and lineitem, we know that the join is a pk-fk join on (partsupp.x, partsupp.y).
+                        // We cannot use the total domains of |supp| or |part| to determine the total domain of (partsupp.x, partsupp.y)
+                        // in partsupp. Instead, we estimate the total domain of |(partsupp.x, partsupp.y)| in partsupp as |partsupp|.
+                        // So the selectivity of the join is 1/|partsupp|.
+                        //
+                        // The same is true when we join (partsupp x part) and lineitem: the total domain of the join is still |partsupp|.
+                        if edges.len() == 1 {
+                            let edge = edges[0].clone();
                             let equivalence_set_id = self
                                 .equivalence_set_map
                                 .get(&(left_node, edge.left_on.clone()))
                                 .expect("Left join condition should be part of an equivalence set");
-                            if seen_equivalence_set_ids.insert(*equivalence_set_id) {
+                            if added_equivalence_set_id_for_td.insert(*equivalence_set_id) {
                                 td *= self.total_domains[*equivalence_set_id];
+                            }
+                            if added_equivalence_set_id_for_conds.insert(*equivalence_set_id) {
                                 conds.push(edge.clone());
                             }
                         }
+                        if edges.len() > 1 {
+                            let node1_plan = self
+                                .id_to_plan
+                                .get(&left_node)
+                                .expect("left id not found in adj list");
+                            let node2_plan = self
+                                .id_to_plan
+                                .get(&right_node)
+                                .expect("right id not found in adj list");
+                            td *= self.get_estimated_total_domain(node1_plan, node2_plan);
+                            for edge in edges {
+                                let equivalence_set_id = self
+                                    .equivalence_set_map
+                                    .get(&(left_node, edge.left_on.clone()))
+                                    .expect(
+                                        "Left join condition should be part of an equivalence set",
+                                    );
+                                if added_equivalence_set_id_for_conds.insert(*equivalence_set_id) {
+                                    conds.push(edge.clone());
+                                }
+                                double_counted_equivalence_set_ids.insert(*equivalence_set_id);
+                            }
+                        }
                     }
                 }
             }
         }
+        for equivalence_set_id in double_counted_equivalence_set_ids {
+            if added_equivalence_set_id_for_td.contains(&equivalence_set_id) {
+                td /= self.total_domains[equivalence_set_id].max(1);
+            }
+        }
+        td = td.max(1);
         (conds, td)
     }
 }