From fbaf8d7d3023719ef8ade09cee819e58a51abe01 Mon Sep 17 00:00:00 2001 From: Shuai li Date: Wed, 30 Aug 2023 21:51:31 +0800 Subject: [PATCH] [GLUTEN-2972][CH] Fix read field from excel format cause coredump (#2973) * fix bool whitespace * fix bool whitespace 2 --- .../resources/csv-data/whitespace_data.csv | 3 +- .../GlutenClickHouseFileFormatSuite.scala | 11 +- .../Serializations/ExcelBoolReader.cpp | 199 ++++++++++++++++++ .../Storages/Serializations/ExcelBoolReader.h | 32 +++ .../Serializations/ExcelSerialization.cpp | 8 +- .../Serializations/ExcelStringReader.cpp | 2 +- .../Serializations/ExcelStringReader.h | 2 +- .../SubstraitSource/ExcelTextFormatFile.cpp | 24 ++- 8 files changed, 265 insertions(+), 16 deletions(-) create mode 100644 cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.cpp create mode 100644 cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.h diff --git a/backends-clickhouse/src/test/resources/csv-data/whitespace_data.csv b/backends-clickhouse/src/test/resources/csv-data/whitespace_data.csv index 6ffe9606e2c4..092841704bf8 100644 --- a/backends-clickhouse/src/test/resources/csv-data/whitespace_data.csv +++ b/backends-clickhouse/src/test/resources/csv-data/whitespace_data.csv @@ -1 +1,2 @@ -1 ,1 ,1 ,1 ,1 \ No newline at end of file +1 ,1 ,1 ,1 ,1 ,True ,2023-08-30 18:00:01 ,2023-08-30 , +2 ,2 ,2 ,2 ,2 ,False ,2023-08-30 18:00:01 ,2023-08-30 , diff --git a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseFileFormatSuite.scala b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseFileFormatSuite.scala index 8eb1589bfc79..5ae52e20dad4 100644 --- a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseFileFormatSuite.scala +++ b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseFileFormatSuite.scala @@ -267,7 +267,11 @@ class GlutenClickHouseFileFormatSuite StructField.apply("long_field", LongType, nullable = true), StructField.apply("float_field", FloatType, nullable = true), StructField.apply("double_field", DoubleType, nullable = true), - StructField.apply("short_field", ShortType, nullable = true) + StructField.apply("short_field", ShortType, nullable = true), + StructField.apply("bool_field", BooleanType, nullable = true), + StructField.apply("timestamp_field", TimestampType, nullable = true), + StructField.apply("date_field", DateType, nullable = true), + StructField.apply("string_field", StringType, nullable = true) )) val options = new util.HashMap[String, String]() @@ -280,8 +284,11 @@ class GlutenClickHouseFileFormatSuite .csv(file_path) .toDF() + val tm1 = Timestamp.valueOf("2023-08-30 18:00:01") + val dt1 = Date.valueOf("2023-08-30") val dataCorrect = new util.ArrayList[Row]() - dataCorrect.add(Row(1, 1.toLong, 1.toFloat, 1.toDouble, 1.toShort)) + dataCorrect.add(Row(1, 1.toLong, 1.toFloat, 1.toDouble, 1.toShort, true, tm1, dt1, null)) + dataCorrect.add(Row(2, 2.toLong, 2.toFloat, 2.toDouble, 2.toShort, false, tm1, dt1, null)) var expectedAnswer: Seq[Row] = null withSQLConf(vanillaSparkConfs(): _*) { diff --git a/cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.cpp b/cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.cpp new file mode 100644 index 000000000000..5aa2108ccbaa --- /dev/null +++ b/cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.cpp @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include "ExcelBoolReader.h" + +namespace DB +{ +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_BOOL; + extern const int ILLEGAL_COLUMN; +} +} + +namespace local_engine +{ +using namespace DB; + + +DB::ColumnUInt8 * checkAndGetDeserializeColumnType(IColumn & column) +{ + auto * col = typeid_cast(&column); + if (!checkAndGetColumn(&column)) + throw Exception(DB::ErrorCodes::ILLEGAL_COLUMN, "Bool type can only deserialize columns of type UInt8.{}", column.getName()); + return col; +} + +bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr) +{ + if (checkCharCaseInsensitive('1', istr)) + { + column->insert(true); + } + else if (checkCharCaseInsensitive('0', istr)) + { + column->insert(false); + } + /// 'True' and 'T' + else if (checkCharCaseInsensitive('t', istr)) + { + /// Check if it's just short form `T` or full form `True` + if (checkCharCaseInsensitive('r', istr)) + { + if (!checkStringCaseInsensitive("ue", istr)) + return false; + } + column->insert(true); + } + /// 'False' and 'F' + else if (checkCharCaseInsensitive('f', istr)) + { + /// Check if it's just short form `F` or full form `False` + if (checkCharCaseInsensitive('a', istr)) + { + if (!checkStringCaseInsensitive("lse", istr)) + return false; + } + column->insert(false); + } + /// 'Yes' and 'Y' + else if (checkCharCaseInsensitive('y', istr)) + { + /// Check if it's just short form `Y` or full form `Yes` + if (checkCharCaseInsensitive('e', istr)) + { + if (!checkCharCaseInsensitive('s', istr)) + return false; + } + column->insert(true); + } + /// 'No' and 'N' + else if (checkCharCaseInsensitive('n', istr)) + { + /// Check if it's just short form `N` or full form `No` + checkCharCaseInsensitive('o', istr); + column->insert(false); + } + /// 'On' and 'Off' + else if (checkCharCaseInsensitive('o', istr)) + { + if (checkCharCaseInsensitive('n', istr)) + column->insert(true); + else if (checkStringCaseInsensitive("ff", istr)) + { + column->insert(false); + } + else + return false; + } + /// 'Enable' and 'Enabled' + else if (checkStringCaseInsensitive("enable", istr)) + { + /// Check if it's 'enable' or 'enabled' + checkCharCaseInsensitive('d', istr); + column->insert(true); + } + /// 'Disable' and 'Disabled' + else if (checkStringCaseInsensitive("disable", istr)) + { + /// Check if it's 'disable' or 'disabled' + checkCharCaseInsensitive('d', istr); + column->insert(false); + } + else + { + return false; + } + + return true; +} + +void deserializeImpl( + IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function check_end_of_value) +{ + DB::ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + + DB::PeekableReadBuffer buf(istr); + buf.setCheckpoint(); + if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf)) + { + col->insert(true); + return; + } + + buf.rollbackToCheckpoint(); + if (checkString(settings.bool_false_representation, buf) && check_end_of_value(buf)) + { + col->insert(false); + buf.dropCheckpoint(); + if (buf.hasUnreadData()) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return; + } + + buf.rollbackToCheckpoint(); + if (tryDeserializeAllVariants(col, buf) && check_end_of_value(buf)) + { + buf.dropCheckpoint(); + if (buf.hasUnreadData()) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return; + } + + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " + "bool_false_representation or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", + String(buf.position(), std::min(10lu, buf.available())), + settings.bool_true_representation, + settings.bool_false_representation); +} + + +void deserializeExcelBoolTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) +{ + if (istr.eof()) + throw DB::Exception(DB::ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + deserializeImpl( + column, + istr, + settings, + [&](DB::ReadBuffer & buf) + { + /// skip all chars before quote/delimiter exclude line delimiter + while (!buf.eof() && *buf.position() == ' ') + ++buf.position(); + + return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; + }); +} + +} diff --git a/cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.h b/cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.h new file mode 100644 index 000000000000..0e6460371e6a --- /dev/null +++ b/cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.h @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + + +namespace local_engine +{ +using namespace DB; + + + +void deserializeExcelBoolTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings); + +} diff --git a/cpp-ch/local-engine/Storages/Serializations/ExcelSerialization.cpp b/cpp-ch/local-engine/Storages/Serializations/ExcelSerialization.cpp index 574a18ffc36a..e9d00d0626cf 100644 --- a/cpp-ch/local-engine/Storages/Serializations/ExcelSerialization.cpp +++ b/cpp-ch/local-engine/Storages/Serializations/ExcelSerialization.cpp @@ -17,11 +17,13 @@ #include "ExcelSerialization.h" #include #include +#include #include #include #include #include #include +#include "ExcelBoolReader.h" #include "ExcelReadHelpers.h" #include "ExcelStringReader.h" @@ -89,7 +91,11 @@ void ExcelSerialization::deserializeTextCSV(IColumn & column, ReadBuffer & istr, } else if (typeid_cast(nested_ptr.get())) { - deserializeExcelTextCSV(column, istr, settings, escape); + deserializeExcelStringTextCSV(column, istr, settings, escape); + } + else if (typeid_cast(nested_ptr.get())) + { + deserializeExcelBoolTextCSV(column, istr, settings); } else { diff --git a/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.cpp b/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.cpp index c65214dce911..7966878953b5 100644 --- a/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.cpp +++ b/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.cpp @@ -227,7 +227,7 @@ void readExcelCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings:: } } -void deserializeExcelTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value) +void deserializeExcelStringTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value) { excelRead(column, [&](ColumnString::Chars & data) { readExcelCSVStringInto(data, istr, settings.csv, escape_value); }); } diff --git a/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.h b/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.h index be1633b3d69f..76b5f918546f 100644 --- a/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.h +++ b/cpp-ch/local-engine/Storages/Serializations/ExcelStringReader.h @@ -55,7 +55,7 @@ template void readExcelCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings, const String & escape_value); -void deserializeExcelTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value); +void deserializeExcelStringTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value); } diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp index 4b5ee9e7b8d4..b31b79b83205 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp @@ -242,6 +242,18 @@ bool ExcelTextFormatReader::readField( || (format_settings.csv.allow_double_quotes && maybe_quote == '\"')) has_quote = true; + auto column_back_func = [&column_size](DB::IColumn & column_back) -> void + { + if (column_back.isNullable()) + { + ColumnNullable & col = assert_cast(column_back); + if (col.getNullMapData().size() == column_size + 1) + col.getNullMapData().pop_back(); + if (col.getNestedColumn().size() == column_size + 1) + col.getNestedColumn().popBack(1); + } + }; + try { /// Read the column normally. @@ -254,9 +266,7 @@ bool ExcelTextFormatReader::readField( throw; skipErrorChars(*buf, has_quote, maybe_quote, format_settings); - - if (column.size() == column_size + 1) - column.popBack(1); + column_back_func(column); column.insertDefault(); return false; @@ -265,13 +275,7 @@ bool ExcelTextFormatReader::readField( if (column_size == column.size()) { skipErrorChars(*buf, has_quote, maybe_quote, format_settings); - if (column.isNullable()) - { - ColumnNullable & col = assert_cast(column); - if (col.getNullMapData().size() == column_size + 1) - col.getNullMapData().pop_back(); - } - + column_back_func(column); column.insertDefault(); return false; }