From 0f19a86320e8d6eea670951113e997dfd4281a64 Mon Sep 17 00:00:00 2001 From: KevinyhZou <37431499+KevinyhZou@users.noreply.github.com> Date: Mon, 11 Mar 2024 18:35:00 +0800 Subject: [PATCH] [GLUTEN-4898][CH]Bug fix to date diff (#4900) What changes were proposed in this pull request? (Please fill in changes proposed in this fix) (Fixes: #4898) Fix diff problem of to_date function; Fix exception throws from parseDateTimeBestEffort of SparkFunctionToDateTime; simplify substring function code How was this patch tested? TEST BY UT --- .../GlutenClickHouseTPCHParquetSuite.scala | 5 +++- .../Functions/SparkFunctionToDate.cpp | 30 +++++++++---------- .../Functions/SparkFunctionToDateTime.h | 4 +-- .../scalar_function_parser/substring.cpp | 5 ++-- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCHParquetSuite.scala b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCHParquetSuite.scala index 1c44c2de0191..9ffde6b5c90e 100644 --- a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCHParquetSuite.scala +++ b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCHParquetSuite.scala @@ -2103,7 +2103,10 @@ class GlutenClickHouseTPCHParquetSuite extends GlutenClickHouseTPCHAbstractSuite |(4, '2023-09-02 00:00:01.333-11'), |(5, ' 2023-09-02 agdfegfew'), |(6, 'afe2023-09-02 11:22:33'), - |(7, '1970-01-01 00:00:00') + |(7, '1970-01-01 00:00:00'), + |(8, '2024-3-2'), + |(9, '2024-03-2'), + |(10, '2024-03') |""".stripMargin spark.sql(create_table_sql) spark.sql(insert_data_sql) diff --git a/cpp-ch/local-engine/Functions/SparkFunctionToDate.cpp b/cpp-ch/local-engine/Functions/SparkFunctionToDate.cpp index cef3d14522b9..0b963e769efd 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionToDate.cpp +++ b/cpp-ch/local-engine/Functions/SparkFunctionToDate.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace DB { @@ -50,7 +51,9 @@ class SparkFunctionConvertToDate : public DB::FunctionToDate32OrNull for (size_t i = start; i < start + length; ++i) { if (!isNumericASCII(*(rb.position() + i))) + { return false; + } } return true; }; @@ -63,7 +66,7 @@ class SparkFunctionConvertToDate : public DB::FunctionToDate32OrNull }; if (!checkNumbericASCII(buf, 0, 4) || !checkDelimiter(buf, 4) - || !checkNumbericASCII(buf, 5, 2) + || !checkNumbericASCII(buf, 5, 2) || !checkDelimiter(buf, 7) || !checkNumbericASCII(buf, 8, 2)) return false; @@ -112,19 +115,19 @@ class SparkFunctionConvertToDate : public DB::FunctionToDate32OrNull throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s return type must be date32.", name); using ColVecTo = DB::DataTypeDate32::ColumnType; - typename ColVecTo::MutablePtr result_column = ColVecTo::create(size); + typename ColVecTo::MutablePtr result_column = ColVecTo::create(size, 0); typename ColVecTo::Container & result_container = result_column->getData(); - DB::ColumnUInt8::MutablePtr null_map = DB::ColumnUInt8::create(size); + DB::ColumnUInt8::MutablePtr null_map = DB::ColumnUInt8::create(size, 0); typename DB::ColumnUInt8::Container & null_container = null_map->getData(); - const DateLUTImpl * time_zone = &DateLUT::instance(); + const DateLUTImpl * local_time_zone = &DateLUT::instance(); + const DateLUTImpl * utc_time_zone = &DateLUT::instance("UTC"); for (size_t i = 0; i < size; ++i) { auto str = src_col->getDataAt(i); - if (str.size < 10) + if (str.size < 4) { null_container[i] = true; - result_container[i] = 0; continue; } else @@ -134,20 +137,17 @@ class SparkFunctionConvertToDate : public DB::FunctionToDate32OrNull { buf.position() ++; } - if(buf.buffer().end() - buf.position() < 10) + if(buf.buffer().end() - buf.position() < 4) { null_container[i] = true; - result_container[i] = 0; continue; } - if (!checkAndGetDate32(buf, result_container[i], *time_zone)) - { - null_container[i] = true; - result_container[i] = 0; - } - else + if (!checkAndGetDate32(buf, result_container[i], *local_time_zone)) { - null_container[i] = false; + time_t tmp = 0; + bool parsed = tryParseDateTimeBestEffort(tmp, buf, *local_time_zone, *utc_time_zone); + result_container[i] = local_time_zone->toDayNum(tmp); + null_container[i] = !parsed; } } } diff --git a/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h b/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h index 760a4da75042..d185b850fa1f 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h @@ -163,8 +163,8 @@ class SparkFunctionConvertToDateTime : public DB::FunctionToDateTime64OrNull } else { - parseDateTime64BestEffort(dst_data[i], scale, buf, *local_time_zone, *utc_time_zone); - null_map_data[i] = 0; + bool parsed = tryParseDateTime64BestEffort(dst_data[i], scale, buf, *local_time_zone, *utc_time_zone); + null_map_data[i] = !parsed; } } } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp index f4c21f4ad7fa..2a44c3e38086 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/substring.cpp @@ -51,15 +51,14 @@ class FunctionParserSubstring : public FunctionParser /** parse substring(str, start_index, length) as if (start_index == 0) - substring(str, start_index+1, length) + substring(str, 1, length) else substring(str, start_index, length) */ auto * const_zero_node = addColumnToActionsDAG(actions_dag, start_index_data_type, Field(0)); auto * const_one_node = addColumnToActionsDAG(actions_dag, start_index_data_type, Field(1)); auto * equals_zero_node = toFunctionNode(actions_dag, "equals", {parsed_args[1], const_zero_node}); - auto * index_plus_node = toFunctionNode(actions_dag, "plus", {parsed_args[1], const_one_node}); - auto * if_node = toFunctionNode(actions_dag, "if", {equals_zero_node, index_plus_node, parsed_args[1]}); + auto * if_node = toFunctionNode(actions_dag, "if", {equals_zero_node, const_one_node, parsed_args[1]}); const DB::ActionsDAG::Node * substring_func_node; if (parsed_args.size() == 2) substring_func_node = toFunctionNode(actions_dag, "substringUTF8", {parsed_args[0], if_node});