Support map_from_arrays
WangGuangxin committed Aug 10, 2024
1 parent 920cfaf commit c16df96
Showing 11 changed files with 167 additions and 0 deletions.
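For context, map_from_arrays(keys, values) builds a map column by zipping a key array with a value array, and this commit lets Gluten keep such expressions in native Velox execution (the new tests below check for ProjectExecTransformer rather than a fallback). A minimal, illustrative spark-shell style sketch; the SparkSession named `spark` and the sample data are assumptions that mirror the new tests, not part of the diff:

import org.apache.spark.sql.functions.map_from_arrays
import spark.implicits._ // assumes an active SparkSession named `spark`

val df = Seq((Seq(1, 2), Seq("a", "b"))).toDF("k", "v")
df.select(map_from_arrays($"k", $"v")).show(false)
// expected result, the same on vanilla Spark and, after this change, on the Velox backend:
// {1 -> a, 2 -> b}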
@@ -2074,6 +2074,28 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
test("Deduplicate sorting keys") {
runQueryAndCompare("select * from lineitem order by l_orderkey, l_orderkey") {
checkGlutenOperatorMatch[SortExecTransformer]
}

test("test map_from_arrays") {
withTempView("t") {
Seq((Seq(1, 2, 1), Seq("a", "b", "c"))).toDF("k", "v").createOrReplaceTempView("t")
withSQLConf(SQLConf.MAP_KEY_DEDUP_POLICY.key -> SQLConf.MapKeyDedupPolicy.LAST_WIN.toString) {
runQueryAndCompare(
"""
|select map_from_arrays(k, v) from t
|""".stripMargin
) {
checkGlutenOperatorMatch[ProjectExecTransformer]
}
}

withSQLConf(
SQLConf.MAP_KEY_DEDUP_POLICY.key -> SQLConf.MapKeyDedupPolicy.EXCEPTION.toString) {
val msg = intercept[Exception] {
spark.sql("select map_from_arrays(k, v) from t").collect()
}.getMessage
assert(msg.contains("Duplicate map keys (1) are not allowed"))
}
}
}
}
2 changes: 2 additions & 0 deletions cpp/core/config/GlutenConfig.h
@@ -36,6 +36,8 @@ const std::string kAllowPrecisionLoss = "spark.sql.decimalOperations.allowPrecis

const std::string kIgnoreMissingFiles = "spark.sql.files.ignoreMissingFiles";

const std::string kMapKeyDedupPolicy = "spark.sql.mapKeyDedupPolicy";

const std::string kDefaultSessionTimezone = "spark.gluten.sql.session.timeZone.default";

const std::string kSparkOffHeapMemory = "spark.gluten.memory.offHeap.size.in.bytes";
3 changes: 3 additions & 0 deletions cpp/velox/compute/WholeStageResultIterator.cc
@@ -512,6 +512,9 @@ std::unordered_map<std::string, std::string> WholeStageResultIterator::getQueryC

    configs[velox::core::QueryConfig::kSparkPartitionId] = std::to_string(taskInfo_.partitionId);

    configs[velox::core::QueryConfig::kSparkMapKeyDedupPolicy] =
        veloxCfg_->get<std::string>(kMapKeyDedupPolicy, "EXCEPTION");

  } catch (const std::invalid_argument& err) {
    std::string errDetails = err.what();
    throw std::runtime_error("Invalid conf arg: " + errDetails);
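The value forwarded above comes from Spark's spark.sql.mapKeyDedupPolicy session conf (the kMapKeyDedupPolicy constant added in GlutenConfig.h), with EXCEPTION as the fallback when the conf is unset. A minimal, illustrative sketch of the two policies, mirroring the new TestOperator test; the SparkSession named `spark` is an assumption:

spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN")
spark.sql("select map_from_arrays(array(1, 2, 1), array('a', 'b', 'c'))").show(false)
// LAST_WIN: the duplicate key 1 keeps the last value -> {1 -> c, 2 -> b}

spark.conf.set("spark.sql.mapKeyDedupPolicy", "EXCEPTION")
spark.sql("select map_from_arrays(array(1, 2, 1), array('a', 'b', 'c'))").collect()
// EXCEPTION (the default): on the Velox backend the query fails with
// "Duplicate map keys (1) are not allowed"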
@@ -273,6 +273,8 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("aggregate function - array for non-primitive type")
// Rewrite this test because Velox sorts rows by key for primitive data types, which disrupts the original row sequence.
.exclude("map_zip_with function - map of primitive types")
// Rewrite this test because Velox's exception message is different with vanilla spark.
.exclude("map with arrays")
enableSuite[GlutenDataFrameTungstenSuite]
enableSuite[GlutenDataFrameSetOperationsSuite]
// Result depends on the implementation for nondeterministic expression rand.
@@ -17,6 +17,7 @@
package org.apache.spark.sql

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenSQLTestsTrait {
  import testImplicits._
@@ -131,4 +132,36 @@ class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenS
      oneRowDF.selectExpr("flatten(null)")
    }
  }

  testGluten("map with arrays") {
    val df1 = Seq((Seq(1, 2), Seq("a", "b"))).toDF("k", "v")
    val expectedType = MapType(IntegerType, StringType, valueContainsNull = true)
    val row = df1.select(map_from_arrays($"k", $"v")).first()
    assert(row.schema(0).dataType === expectedType)
    assert(row.getMap[Int, String](0) === Map(1 -> "a", 2 -> "b"))
    checkAnswer(df1.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> "a", 2 -> "b"))))

    val df2 = Seq((Seq(1, 2), Seq(null, "b"))).toDF("k", "v")
    checkAnswer(df2.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> null, 2 -> "b"))))

    val df3 = Seq((null, null)).toDF("k", "v")
    checkAnswer(df3.select(map_from_arrays($"k", $"v")), Seq(Row(null)))

    val df4 = Seq((1, "a")).toDF("k", "v")
    intercept[AnalysisException] {
      df4.select(map_from_arrays($"k", $"v"))
    }

    val df5 = Seq((Seq("a", null), Seq(1, 2))).toDF("k", "v")
    val msg1 = intercept[Exception] {
      df5.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg1.contains("map key cannot be null"))

    val df6 = Seq((Seq(1, 2), Seq("a"))).toDF("k", "v")
    val msg2 = intercept[Exception] {
      df6.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg2.contains("Key and value arrays must be the same length"))
  }
}
@@ -989,6 +989,8 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("aggregate function - array for non-primitive type")
// Rewrite this test because Velox sorts rows by key for primitive data types, which disrupts the original row sequence.
.exclude("map_zip_with function - map of primitive types")
// Rewrite this test because Velox's exception message is different with vanilla spark.
.exclude("map with arrays")
enableSuite[GlutenDataFrameHintSuite]
enableSuite[GlutenDataFrameImplicitsSuite]
enableSuite[GlutenDataFrameJoinSuite]
@@ -17,6 +17,7 @@
package org.apache.spark.sql

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenSQLTestsTrait {
  import testImplicits._
@@ -131,4 +132,36 @@ class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenS
      oneRowDF.selectExpr("flatten(null)")
    }
  }

  testGluten("map with arrays") {
    val df1 = Seq((Seq(1, 2), Seq("a", "b"))).toDF("k", "v")
    val expectedType = MapType(IntegerType, StringType, valueContainsNull = true)
    val row = df1.select(map_from_arrays($"k", $"v")).first()
    assert(row.schema(0).dataType === expectedType)
    assert(row.getMap[Int, String](0) === Map(1 -> "a", 2 -> "b"))
    checkAnswer(df1.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> "a", 2 -> "b"))))

    val df2 = Seq((Seq(1, 2), Seq(null, "b"))).toDF("k", "v")
    checkAnswer(df2.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> null, 2 -> "b"))))

    val df3 = Seq((null, null)).toDF("k", "v")
    checkAnswer(df3.select(map_from_arrays($"k", $"v")), Seq(Row(null)))

    val df4 = Seq((1, "a")).toDF("k", "v")
    intercept[AnalysisException] {
      df4.select(map_from_arrays($"k", $"v"))
    }

    val df5 = Seq((Seq("a", null), Seq(1, 2))).toDF("k", "v")
    val msg1 = intercept[Exception] {
      df5.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg1.contains("map key cannot be null"))

    val df6 = Seq((Seq(1, 2), Seq("a"))).toDF("k", "v")
    val msg2 = intercept[Exception] {
      df6.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg2.contains("Key and value arrays must be the same length"))
  }
}
@@ -998,6 +998,8 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("aggregate function - array for non-primitive type")
// Rewrite this test because Velox sorts rows by key for primitive data types, which disrupts the original row sequence.
.exclude("map_zip_with function - map of primitive types")
// Rewrite this test because Velox's exception message is different with vanilla spark.
.exclude("map with arrays")
enableSuite[GlutenDataFrameHintSuite]
enableSuite[GlutenDataFrameImplicitsSuite]
enableSuite[GlutenDataFrameJoinSuite]
@@ -17,6 +17,7 @@
package org.apache.spark.sql

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenSQLTestsTrait {
  import testImplicits._
@@ -49,4 +50,36 @@ class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenS
      false
    )
  }

  testGluten("map with arrays") {
    val df1 = Seq((Seq(1, 2), Seq("a", "b"))).toDF("k", "v")
    val expectedType = MapType(IntegerType, StringType, valueContainsNull = true)
    val row = df1.select(map_from_arrays($"k", $"v")).first()
    assert(row.schema(0).dataType === expectedType)
    assert(row.getMap[Int, String](0) === Map(1 -> "a", 2 -> "b"))
    checkAnswer(df1.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> "a", 2 -> "b"))))

    val df2 = Seq((Seq(1, 2), Seq(null, "b"))).toDF("k", "v")
    checkAnswer(df2.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> null, 2 -> "b"))))

    val df3 = Seq((null, null)).toDF("k", "v")
    checkAnswer(df3.select(map_from_arrays($"k", $"v")), Seq(Row(null)))

    val df4 = Seq((1, "a")).toDF("k", "v")
    intercept[AnalysisException] {
      df4.select(map_from_arrays($"k", $"v"))
    }

    val df5 = Seq((Seq("a", null), Seq(1, 2))).toDF("k", "v")
    val msg1 = intercept[Exception] {
      df5.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg1.contains("map key cannot be null"))

    val df6 = Seq((Seq(1, 2), Seq("a"))).toDF("k", "v")
    val msg2 = intercept[Exception] {
      df6.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg2.contains("Key and value arrays must be the same length"))
  }
}
@@ -1014,6 +1014,8 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("aggregate function - array for non-primitive type")
// Rewrite this test because Velox sorts rows by key for primitive data types, which disrupts the original row sequence.
.exclude("map_zip_with function - map of primitive types")
// Rewrite this test because Velox's exception message is different with vanilla spark.
.exclude("map with arrays")
enableSuite[GlutenDataFrameHintSuite]
enableSuite[GlutenDataFrameImplicitsSuite]
enableSuite[GlutenDataFrameJoinSuite]
@@ -17,6 +17,7 @@
package org.apache.spark.sql

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenSQLTestsTrait {
  import testImplicits._
@@ -49,4 +50,36 @@ class GlutenDataFrameFunctionsSuite extends DataFrameFunctionsSuite with GlutenS
      false
    )
  }

  testGluten("map with arrays") {
    val df1 = Seq((Seq(1, 2), Seq("a", "b"))).toDF("k", "v")
    val expectedType = MapType(IntegerType, StringType, valueContainsNull = true)
    val row = df1.select(map_from_arrays($"k", $"v")).first()
    assert(row.schema(0).dataType === expectedType)
    assert(row.getMap[Int, String](0) === Map(1 -> "a", 2 -> "b"))
    checkAnswer(df1.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> "a", 2 -> "b"))))

    val df2 = Seq((Seq(1, 2), Seq(null, "b"))).toDF("k", "v")
    checkAnswer(df2.select(map_from_arrays($"k", $"v")), Seq(Row(Map(1 -> null, 2 -> "b"))))

    val df3 = Seq((null, null)).toDF("k", "v")
    checkAnswer(df3.select(map_from_arrays($"k", $"v")), Seq(Row(null)))

    val df4 = Seq((1, "a")).toDF("k", "v")
    intercept[AnalysisException] {
      df4.select(map_from_arrays($"k", $"v"))
    }

    val df5 = Seq((Seq("a", null), Seq(1, 2))).toDF("k", "v")
    val msg1 = intercept[Exception] {
      df5.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg1.contains("map key cannot be null"))

    val df6 = Seq((Seq(1, 2), Seq("a"))).toDF("k", "v")
    val msg2 = intercept[Exception] {
      df6.select(map_from_arrays($"k", $"v")).collect
    }.getMessage
    assert(msg2.contains("Key and value arrays must be the same length"))
  }
}
