[GLUTEN-1874][CH] Fixes nullable mismatch in union (apache#1901)
Fixes a nullable mismatch in union for CH: when the inputs of a UNION ALL disagree on nullability, the native pipeline could produce a header that does not match the output schema Spark declares. The Spark plan's output schema is now carried in the substrait plan (RelRoot.output_schema), and the ClickHouse parser adds a final projection to reconcile any difference.
lgbo-ustc authored Jul 14, 2023
1 parent e05462d commit d942ee1
Showing 14 changed files with 157 additions and 65 deletions.
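Background for the fix: Spark derives the nullability of each UNION ALL output column as the OR of the corresponding child columns, while the ClickHouse pipeline keeps whatever header each stream actually produced. A minimal Scala sketch of the Spark-side rule (it mirrors Catalyst's Union.output; illustrative only, not code from this commit):

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.LongType

// The union output column is nullable if ANY child column is nullable.
def unionOutput(children: Seq[Seq[Attribute]]): Seq[Attribute] =
  children.transpose.map(attrs => attrs.head.withNullability(attrs.exists(_.nullable)))

// One stream guarded by IS NOT NULL (non-nullable), the other not (nullable):
val left = Seq(AttributeReference("n_regionkey", LongType, nullable = false)())
val right = Seq(AttributeReference("n_regionkey", LongType, nullable = true)())
unionOutput(Seq(left, right)).head.nullable // true, yet CH may build a non-nullable header

When the two sides disagree like this, the header produced by the native pipeline no longer matches the schema Spark declared, which is exactly the mismatch the changes below reconcile.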
@@ -155,4 +155,5 @@ object CHBackendSettings extends BackendSettingsApi with Logging {
GlutenConfig.GLUTEN_CONFIG_PREFIX + GlutenConfig.GLUTEN_CLICKHOUSE_BACKEND

override def shuffleSupportedCodec(): Set[String] = GLUTNE_CLICKHOUSE_SHUFFLE_SUPPORTED_CODEC
override def needOutputSchemaForPlan(): Boolean = true
}
@@ -75,7 +75,9 @@ case class CHFilterExecTransformer(condition: Expression, child: SparkPlan)
if (leftCondition == null) {
// The computing for this filter is not needed.
context.registerEmptyRelToOperator(operatorId)
-      return childCtx
+      // Since some columns' nullability will be removed after this filter, we need to update the
+      // outputAttributes of child context.
+      return TransformContext(childCtx.inputAttributes, output, childCtx.root)
}

val currRel = if (childCtx != null) {
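Why the filter's own output matters here: the ClickHouse backend strips the Nullable wrapper from columns that an IS NOT NULL predicate guarantees, so the transformer's declared output can differ from childCtx.outputAttributes even when the filter itself is elided. A hedged sketch of that derivation (the notNullColumns set is hypothetical; not code from this commit):

import org.apache.spark.sql.catalyst.expressions.{Attribute, ExprId}

// Drop nullability on attributes the filter proves non-null; keep the rest.
def filterOutput(childOutput: Seq[Attribute], notNullColumns: Set[ExprId]): Seq[Attribute] =
  childOutput.map {
    a => if (a.nullable && notNullColumns.contains(a.exprId)) a.withNullability(false) else a
  }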
@@ -1438,6 +1438,41 @@ class GlutenClickHouseTPCHParquetSuite extends GlutenClickHouseTPCHAbstractSuite
compareResultsAgainstVanillaSpark(sql5, true, { _ => }, false)
}

test("GLUTEN-1874 not null in one stream") {
val sql =
"""
|select n_regionkey from (
| select *, row_number() over (partition by n_regionkey order by is_new) as rank from(
| select n_regionkey, 0 as is_new from nation where n_regionkey is not null
| union all
| select n_regionkey, 1 as is_new from (
| select n_regionkey,
| row_number() over (partition by n_regionkey order by n_nationkey) as rn from nation
| ) t0 where rn = 1
| ) t1
|) t2 where rank = 1
""".stripMargin
compareResultsAgainstVanillaSpark(sql, true, { _ => })
}

test("GLUTEN-1874 not null in both streams") {
val sql =
"""
|select n_regionkey from (
| select *, row_number() over (partition by n_regionkey order by is_new) as rank from(
| select n_regionkey, 0 as is_new from nation where n_regionkey is not null
| union all
| select n_regionkey, 1 as is_new from (
| select n_regionkey,
| row_number() over (partition by n_regionkey order by n_nationkey) as rn
| from nation where n_regionkey is not null
| ) t0 where rn = 1
| ) t1
|) t2 where rank = 1
""".stripMargin
compareResultsAgainstVanillaSpark(sql, true, { _ => })
}

test("GLUTEN-2095: test cast(string as binary)") {
runQueryAndCompare(
"select cast(n_nationkey as binary), cast(n_comment as binary) from nation"
51 changes: 48 additions & 3 deletions cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
@@ -1,4 +1,5 @@
#include "SerializedPlanParser.h"
#include <algorithm>
#include <memory>
#include <string_view>
#include <AggregateFunctions/AggregateFunctionFactory.h>
@@ -366,7 +367,7 @@ DB::QueryPlanPtr SerializedPlanParser::parseMergeTreeTable(const substrait::Read
{
auto input_header = query->getCurrentDataStream().header;
std::erase_if(non_nullable_columns, [input_header](auto item) -> bool { return !input_header.has(item); });
-    auto* remove_null_step = addRemoveNullableStep(*query, non_nullable_columns);
+    auto * remove_null_step = addRemoveNullableStep(*query, non_nullable_columns);
if (remove_null_step)
{
steps.emplace_back(remove_null_step);
@@ -396,7 +397,7 @@ SerializedPlanParser::parsePreWhereInfo(const substrait::Expression & rel, Block
prewhere_info->need_filter = true;
prewhere_info->remove_prewhere_column = true;
auto cols = prewhere_info->prewhere_actions->getRequiredColumnsNames();
// Keep it the same as the input.
prewhere_info->prewhere_actions->removeUnusedActions(Names{filter_name}, false, true);
prewhere_info->prewhere_actions->projectInput(false);
for (const auto & name : input.getNames())
@@ -458,6 +459,51 @@ QueryPlanPtr SerializedPlanParser::parse(std::unique_ptr<substrait::Plan> plan)
expression_step->setStepDescription("Rename Output");
query_plan->addStep(std::move(expression_step));
}

// Fixes issue-1874: keep the nullability of the final header as the output schema expects.
const auto & output_schema = root_rel.root().output_schema();
if (output_schema.types_size())
{
auto original_header = query_plan->getCurrentDataStream().header;
const auto & original_cols = original_header.getColumnsWithTypeAndName();
if (static_cast<size_t>(output_schema.types_size()) != original_cols.size())
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Mismatch output schema");
}
bool need_final_project = false;
DB::ColumnsWithTypeAndName final_cols;
for (int i = 0; i < output_schema.types_size(); ++i)
{
const auto & col = original_cols[i];
auto type = TypeParser::parseType(output_schema.types(i));
// At present, we only check nullable mismatch.
// intermediate aggregate data is special, no check here.
if (type->isNullable() != col.type->isNullable() && !typeid_cast<const DB::DataTypeAggregateFunction*>(col.type.get()))
{
if (type->isNullable())
{
final_cols.emplace_back(type->createColumn(), std::make_shared<DB::DataTypeNullable>(col.type), col.name);
}
else
{
final_cols.emplace_back(type->createColumn(), DB::removeNullable(col.type), col.name);
}
need_final_project = true;
}
else
{
final_cols.push_back(col);
}
}
if (need_final_project)
{
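// Note: makeConvertingActions matches columns by position here; only the Nullable wrapper changes, names and order are preserved.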
ActionsDAGPtr final_project
= ActionsDAG::makeConvertingActions(original_cols, final_cols, ActionsDAG::MatchColumnsMode::Position);
QueryPlanStepPtr final_project_step = std::make_unique<ExpressionStep>(query_plan->getCurrentDataStream(), final_project);
final_project_step->setStepDescription("Project for output schema");
query_plan->addStep(std::move(final_project_step));
}
}
return query_plan;
}
else
@@ -2591,5 +2637,4 @@ void NonNullableColumnsResolver::visitNonNullable(const substrait::Expression &
}
// else, do nothing.
}

}
@@ -22,6 +22,7 @@
import io.glutenproject.substrait.extensions.ExtensionBuilder;
import io.glutenproject.substrait.extensions.FunctionMappingNode;
import io.glutenproject.substrait.rel.RelNode;
import io.glutenproject.substrait.type.TypeNode;

import java.util.ArrayList;
import java.util.Map;
@@ -36,12 +37,29 @@ public static PlanNode makePlan(ArrayList<FunctionMappingNode> mappingNodes,
return new PlanNode(mappingNodes, relNodes, outNames);
}

public static PlanNode makePlan(ArrayList<FunctionMappingNode> mappingNodes,
ArrayList<RelNode> relNodes,
ArrayList<String> outNames,
TypeNode outputSchema,
AdvancedExtensionNode extension) {
return new PlanNode(mappingNodes, relNodes, outNames, outputSchema, extension);
}

public static PlanNode makePlan(AdvancedExtensionNode extension) {
return new PlanNode(extension);
}

-  public static PlanNode makePlan(SubstraitContext subCtx, ArrayList<RelNode> relNodes,
+  public static PlanNode makePlan(SubstraitContext subCtx,
+                                  ArrayList<RelNode> relNodes,
                                   ArrayList<String> outNames) {
+    return makePlan(subCtx, relNodes, outNames, null, null);
}

public static PlanNode makePlan(SubstraitContext subCtx,
ArrayList<RelNode> relNodes,
ArrayList<String> outNames,
TypeNode outputSchema,
AdvancedExtensionNode extension) {
if (subCtx == null) {
throw new NullPointerException("ColumnarWholestageTransformer cannot doTansform.");
}
@@ -52,6 +70,9 @@ public static PlanNode makePlan(SubstraitContext subCtx, ArrayList<RelNode> relN
ExtensionBuilder.makeFunctionMapping(entry.getKey(), entry.getValue());
mappingNodes.add(mappingNode);
}
if (extension != null || outputSchema != null) {
return makePlan(mappingNodes, relNodes, outNames, outputSchema, extension);
}
return makePlan(mappingNodes, relNodes, outNames);
}

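A hedged usage sketch of the new five-argument overload from the Scala side (substraitContext and relNodes are assumed to be in scope; the real call site is in WholeStageTransformer further below):

import com.google.common.collect.Lists
import io.glutenproject.expression.ConverterUtils
import io.glutenproject.substrait.`type`.TypeBuilder
import io.glutenproject.substrait.plan.PlanBuilder
import org.apache.spark.sql.types.LongType

val outNames = Lists.newArrayList("n_regionkey#10")
// One nullable BIGINT column; the field nullability is what the native parser checks.
val typeNodes = Lists.newArrayList(ConverterUtils.getTypeNode(LongType, true))
val outputSchema = TypeBuilder.makeStruct(false, typeNodes)
// A non-null outputSchema (the extension may stay null) selects the new PlanNode constructor.
val planNode = PlanBuilder.makePlan(substraitContext, relNodes, outNames, outputSchema, null)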
@@ -20,6 +20,7 @@
import io.glutenproject.substrait.extensions.AdvancedExtensionNode;
import io.glutenproject.substrait.extensions.FunctionMappingNode;
import io.glutenproject.substrait.rel.RelNode;
import io.glutenproject.substrait.type.TypeNode;
import io.substrait.proto.Plan;
import io.substrait.proto.PlanRel;
import io.substrait.proto.RelRoot;
@@ -32,6 +33,7 @@ public class PlanNode implements Serializable {
private final ArrayList<RelNode> relNodes = new ArrayList<>();
private final ArrayList<String> outNames = new ArrayList<>();

private TypeNode outputSchema = null;
private AdvancedExtensionNode extension = null;

PlanNode(ArrayList<FunctionMappingNode> mappingNodes,
@@ -42,6 +44,18 @@ public class PlanNode implements Serializable {
this.outNames.addAll(outNames);
}

PlanNode(ArrayList<FunctionMappingNode> mappingNodes,
ArrayList<RelNode> relNodes,
ArrayList<String> outNames,
TypeNode outputSchema,
AdvancedExtensionNode extension) {
this.mappingNodes.addAll(mappingNodes);
this.relNodes.addAll(relNodes);
this.outNames.addAll(outNames);
this.outputSchema = outputSchema;
this.extension = extension;
}

PlanNode(AdvancedExtensionNode extension) {
this.extension = extension;
}
@@ -61,6 +75,9 @@ public Plan toProtobuf() {
for (String name : outNames) {
relRootBuilder.addNames(name);
}
if (outputSchema != null) {
relRootBuilder.setOutputSchema(outputSchema.toProtobuf().getStruct());
}
planRelBuilder.setRoot(relRootBuilder.build());

planBuilder.addRelations(planRelBuilder.build());
@@ -406,6 +406,7 @@ message RelRoot {
Rel input = 1;
// Field names in depth-first order
repeated string names = 2;
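// Expected output schema; used by the native parser to check and fix up the nullability of the final header (GLUTEN-1874)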
Type.Struct output_schema = 3;
}

// A relation (used internally in a plan)
@@ -87,4 +87,6 @@ trait BackendSettingsApi {
def rescaleDecimalIntegralExpression(): Boolean = false

def shuffleSupportedCodec(): Set[String]

def needOutputSchemaForPlan(): Boolean = false
}
@@ -31,7 +31,7 @@ import io.glutenproject.substrait.expression.ExpressionNode
import io.glutenproject.substrait.extensions.ExtensionBuilder
import io.glutenproject.substrait.plan.PlanBuilder
import io.glutenproject.substrait.rel.{RelBuilder, RelNode}
-import io.glutenproject.utils.BindReferencesUtil

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
@@ -309,8 +309,7 @@ case class ProjectExecTransformer(projectList: Seq[NamedExpression],
}
assert(currRel != null, "Project Rel should be valid")

-    val outputAttrs = BindReferencesUtil.bindReferencesWithNullable(output, inputAttributes)
-    TransformContext(inputAttributes, outputAttrs, currRel)
+    TransformContext(inputAttributes, output, currRel)
}

override def output: Seq[Attribute] = projectList.map(_.toAttribute)
@@ -30,7 +30,6 @@ import io.glutenproject.substrait.expression.{ExpressionBuilder, ExpressionNode}
import io.glutenproject.substrait.extensions.ExtensionBuilder
import io.glutenproject.substrait.plan.PlanBuilder
import io.glutenproject.substrait.rel.{RelBuilder, RelNode}
-import io.glutenproject.utils.BindReferencesUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
@@ -269,8 +268,7 @@ case class ExpandExecTransformer(projections: Seq[Seq[Expression]],
child.output)
}
assert(currRel != null, "Expand Rel should be valid")
-    val outputAttrs = BindReferencesUtil.bindReferencesWithNullable(output, inputAttributes)
-    TransformContext(inputAttributes, outputAttrs, currRel)
+    TransformContext(inputAttributes, output, currRel)
}

protected override def doExecute(): RDD[InternalRow] =
@@ -30,7 +30,6 @@ import io.glutenproject.extension.ValidationResult
import io.glutenproject.substrait.`type`.{TypeBuilder, TypeNode}
import io.glutenproject.substrait.extensions.ExtensionBuilder
import io.glutenproject.substrait.plan.PlanBuilder
-import io.glutenproject.utils.BindReferencesUtil
import io.substrait.proto.SortField
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
@@ -296,8 +295,7 @@ case class SortExecTransformer(sortOrder: Seq[SortOrder],
child.output)
}
assert(currRel != null, "Sort Rel should be valid")
-    val outputAttrs = BindReferencesUtil.bindReferencesWithNullable(output, inputAttributes)
-    TransformContext(inputAttributes, outputAttrs, currRel)
+    TransformContext(inputAttributes, output, currRel)
}

override def doExecuteColumnar(): RDD[ColumnarBatch] = {
@@ -18,15 +18,19 @@
package io.glutenproject.execution

import com.google.common.collect.Lists

import io.glutenproject.GlutenConfig
import io.glutenproject.backendsapi.BackendsApiManager
import io.glutenproject.expression._
import io.glutenproject.extension.GlutenPlan
import io.glutenproject.metrics.{MetricsUpdater, NoopMetricsUpdater}
import io.glutenproject.substrait.`type`.{TypeBuilder, TypeNode}
import io.glutenproject.substrait.SubstraitContext
import io.glutenproject.substrait.plan.{PlanBuilder, PlanNode}
import io.glutenproject.substrait.rel.RelNode
import io.glutenproject.substrait.`type`.{TypeBuilder, TypeNode}
import io.glutenproject.utils.SubstraitPlanPrinterUtil

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
@@ -164,12 +168,26 @@ case class WholeStageTransformer(child: SparkPlan)(val transformStageId: Int)
throw new NullPointerException(s"ColumnarWholestageTransformer can't doTansform on $child")
}
val outNames = new java.util.ArrayList[String]()
-    for (attr <- childCtx.outputAttributes) {
-      outNames.add(ConverterUtils.genColumnNameWithExprId(attr))
-    }
-    val planNode =
-      PlanBuilder.makePlan(substraitContext, Lists.newArrayList(childCtx.root), outNames)
+    val planNode = if (BackendsApiManager.getSettings.needOutputSchemaForPlan()) {
+      val outputTypeNodeList = new java.util.ArrayList[TypeNode]()
+      for (attr <- childCtx.outputAttributes) {
+        outNames.add(ConverterUtils.genColumnNameWithExprId(attr))
+        outputTypeNodeList.add(ConverterUtils.getTypeNode(attr.dataType, attr.nullable))
+      }
+
+      // Fixes issue-1874
+      val outputSchema = TypeBuilder.makeStruct(false, outputTypeNodeList)
+      PlanBuilder.makePlan(substraitContext,
+        Lists.newArrayList(childCtx.root),
+        outNames,
+        outputSchema,
+        null)
+    } else {
+      for (attr <- childCtx.outputAttributes) {
+        outNames.add(ConverterUtils.genColumnNameWithExprId(attr))
+      }
+      PlanBuilder.makePlan(substraitContext, Lists.newArrayList(childCtx.root), outNames)
+    }
planJson = SubstraitPlanPrinterUtil.substraitPlanToJson(planNode.toProtobuf)

WholestageTransformContext(
@@ -31,7 +31,6 @@ import io.glutenproject.substrait.expression.{ExpressionNode, WindowFunctionNode}
import io.glutenproject.substrait.extensions.ExtensionBuilder
import io.glutenproject.substrait.plan.PlanBuilder
import io.glutenproject.substrait.rel.{RelBuilder, RelNode}
-import io.glutenproject.utils.BindReferencesUtil
import io.substrait.proto.SortField

import org.apache.spark.rdd.RDD
@@ -232,8 +231,7 @@ case class WindowExecTransformer(windowExpression: Seq[NamedExpression],
child.output)
}
assert(currRel != null, "Window Rel should be valid")
-    val outputAttrs = BindReferencesUtil.bindReferencesWithNullable(output, inputAttributes)
-    TransformContext(inputAttributes, outputAttrs, currRel)
+    TransformContext(inputAttributes, output, currRel)
}

override protected def doExecuteColumnar(): RDD[ColumnarBatch] = {