From 58f1cf680a38fddac9f1fb77ce66239646d75822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Fri, 9 Aug 2024 15:23:49 +0800 Subject: [PATCH] [GLUTEN-6388][CH] Support function format (#6716) * support function printf * support function format_string * fix failed uts * fix failed uts * fix failed ut --- .../apache/gluten/utils/CHExpressionUtil.scala | 8 ++++++++ .../execution/GlutenFunctionValidateSuite.scala | 15 +++++++++++++++ .../CommonScalarFunctionParser.cpp | 5 +++-- .../gluten/expression/ExpressionMappings.scala | 1 + .../utils/clickhouse/ClickHouseTestSettings.scala | 1 + .../utils/clickhouse/ClickHouseTestSettings.scala | 1 + .../utils/clickhouse/ClickHouseTestSettings.scala | 1 + .../utils/clickhouse/ClickHouseTestSettings.scala | 1 + .../gluten/expression/ExpressionNames.scala | 1 + 9 files changed, 32 insertions(+), 2 deletions(-) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala index d65de1cea151..ae072b0fbe85 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala @@ -166,6 +166,13 @@ case class ArrayJoinValidator() extends FunctionValidator { } } +case class FormatStringValidator() extends FunctionValidator { + override def doValidate(expr: Expression): Boolean = { + val formatString = expr.asInstanceOf[FormatString] + formatString.children.head.isInstanceOf[Literal] + } +} + object CHExpressionUtil { final val CH_AGGREGATE_FUNC_BLACKLIST: Map[String, FunctionValidator] = Map( @@ -199,6 +206,7 @@ object CHExpressionUtil { SPARK_PARTITION_ID -> DefaultValidator(), URL_DECODE -> DefaultValidator(), URL_ENCODE -> DefaultValidator(), + FORMAT_STRING -> FormatStringValidator(), SKEWNESS -> DefaultValidator(), SOUNDEX -> DefaultValidator(), MAKE_YM_INTERVAL -> DefaultValidator(), diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index 7db5761a284b..45485ac90e1a 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -740,4 +740,19 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS |""".stripMargin runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) } + + test("test function format_string") { + val sql = """ + | SELECT + | format_string( + | 'hello world %d %d %s %f', + | id, + | id, + | CAST(id AS STRING), + | CAST(id AS float) + | ) + |FROM range(10) + |""".stripMargin + runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + } } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp index 726d1683dbff..9c3dc18ec1aa 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp @@ -61,6 +61,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, parseDateTim REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, toQuarter); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(ToUnixTimestamp, to_unix_timestamp, parseDateTimeInJodaSyntaxOrNull); +// math functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Position, positive, identity); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Negative, negative, negate); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Pmod, pmod, pmod); @@ -107,6 +108,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Rand, rand, randCanonical); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Bin, bin, sparkBin); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Rint, rint, sparkRint); +// string functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Like, like, like); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(NotLike, not_like, notLike); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(StartsWith, starts_with, startsWithUTF8); @@ -130,6 +132,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Initcap, initcap, initcapUTF8); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Conv, conv, sparkConv); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Uuid, uuid, generateUUIDv4); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Levenshtein, levenshtein, editDistanceUTF8); +REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FormatString, format_string, printf); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Crc32, crc32, CRC32); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Murmur3Hash, murmur3hash, sparkMurmurHash3_32); @@ -150,7 +153,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FloorDatetime, floor_datetime, dateTrunc) REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Floor, floor, sparkFloor); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MothsBetween, months_between, sparkMonthsBetween); - // array functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Array, array, array); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Shuffle, shuffle, arrayShuffle); @@ -165,7 +167,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MapKeys, map_keys, mapKeys); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MapValues, map_values, mapValues); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MapFromArrays, map_from_arrays, mapFromArrays); - // json functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FlattenJsonStringOnRequired, flattenJSONStringOnRequired, flattenJSONStringOnRequired); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(ToJson, to_json, toJSONString); diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala index ebf0c5139245..e0628f11102d 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala @@ -105,6 +105,7 @@ object ExpressionMappings { Sig[Levenshtein](LEVENSHTEIN), Sig[UnBase64](UNBASE64), Sig[Base64](BASE64), + Sig[FormatString](FORMAT_STRING), // URL functions Sig[ParseUrl](PARSE_URL), diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 8fd68d5170b9..5c2833de4bc0 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -912,6 +912,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("REPEAT") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") + .exclude("FORMAT") // refer https://github.com/apache/incubator-gluten/issues/6765 .excludeGlutenTest("SPARK-40213: ascii for Latin-1 Supplement characters") enableSuite[GlutenTryCastSuite] .exclude("null cast") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index f69598adf555..c8e162e61d66 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -871,6 +871,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("REPEAT") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") + .exclude("FORMAT") // refer https://github.com/apache/incubator-gluten/issues/6765 enableSuite[GlutenTryCastSuite] .exclude("null cast") .exclude("cast string to date") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index ab288e835b12..77c12621efeb 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -775,6 +775,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("REPEAT") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") + .exclude("FORMAT") // refer https://github.com/apache/incubator-gluten/issues/6765 enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSuiteV1Filter] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index ab288e835b12..77c12621efeb 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -775,6 +775,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("REPEAT") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") + .exclude("FORMAT") // refer https://github.com/apache/incubator-gluten/issues/6765 enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSuiteV1Filter] diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala index d47dbc4cc1fa..96a615615179 100644 --- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala +++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala @@ -132,6 +132,7 @@ object ExpressionNames { final val UNBASE64 = "unbase64" final val BASE64 = "base64" final val MASK = "mask" + final val FORMAT_STRING = "format_string" // URL functions final val PARSE_URL = "parse_url"