Skip to content

Commit

Permalink
[GLUTEN-2972][CH] Fix read field from excel format cause coredump (ap…
Browse files Browse the repository at this point in the history
…ache#2973)

* fix bool whitespace

* fix bool whitespace 2
  • Loading branch information
loneylee authored Aug 30, 2023
1 parent 4a4d995 commit fbaf8d7
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 16 deletions.
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
1 ,1 ,1 ,1 ,1
1 ,1 ,1 ,1 ,1 ,True ,2023-08-30 18:00:01 ,2023-08-30 ,
2 ,2 ,2 ,2 ,2 ,False ,2023-08-30 18:00:01 ,2023-08-30 ,
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,11 @@ class GlutenClickHouseFileFormatSuite
StructField.apply("long_field", LongType, nullable = true),
StructField.apply("float_field", FloatType, nullable = true),
StructField.apply("double_field", DoubleType, nullable = true),
StructField.apply("short_field", ShortType, nullable = true)
StructField.apply("short_field", ShortType, nullable = true),
StructField.apply("bool_field", BooleanType, nullable = true),
StructField.apply("timestamp_field", TimestampType, nullable = true),
StructField.apply("date_field", DateType, nullable = true),
StructField.apply("string_field", StringType, nullable = true)
))

val options = new util.HashMap[String, String]()
Expand All @@ -280,8 +284,11 @@ class GlutenClickHouseFileFormatSuite
.csv(file_path)
.toDF()

val tm1 = Timestamp.valueOf("2023-08-30 18:00:01")
val dt1 = Date.valueOf("2023-08-30")
val dataCorrect = new util.ArrayList[Row]()
dataCorrect.add(Row(1, 1.toLong, 1.toFloat, 1.toDouble, 1.toShort))
dataCorrect.add(Row(1, 1.toLong, 1.toFloat, 1.toDouble, 1.toShort, true, tm1, dt1, null))
dataCorrect.add(Row(2, 2.toLong, 2.toFloat, 2.toDouble, 2.toShort, false, tm1, dt1, null))

var expectedAnswer: Seq[Row] = null
withSQLConf(vanillaSparkConfs(): _*) {
Expand Down
199 changes: 199 additions & 0 deletions cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <Columns/ColumnsNumber.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/ReadHelpers.h>

#include "ExcelBoolReader.h"

namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_PARSE_BOOL;
extern const int ILLEGAL_COLUMN;
}
}

namespace local_engine
{
using namespace DB;


DB::ColumnUInt8 * checkAndGetDeserializeColumnType(IColumn & column)
{
auto * col = typeid_cast<DB::ColumnUInt8 *>(&column);
if (!checkAndGetColumn<DB::ColumnUInt8>(&column))
throw Exception(DB::ErrorCodes::ILLEGAL_COLUMN, "Bool type can only deserialize columns of type UInt8.{}", column.getName());
return col;
}

bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr)
{
if (checkCharCaseInsensitive('1', istr))
{
column->insert(true);
}
else if (checkCharCaseInsensitive('0', istr))
{
column->insert(false);
}
/// 'True' and 'T'
else if (checkCharCaseInsensitive('t', istr))
{
/// Check if it's just short form `T` or full form `True`
if (checkCharCaseInsensitive('r', istr))
{
if (!checkStringCaseInsensitive("ue", istr))
return false;
}
column->insert(true);
}
/// 'False' and 'F'
else if (checkCharCaseInsensitive('f', istr))
{
/// Check if it's just short form `F` or full form `False`
if (checkCharCaseInsensitive('a', istr))
{
if (!checkStringCaseInsensitive("lse", istr))
return false;
}
column->insert(false);
}
/// 'Yes' and 'Y'
else if (checkCharCaseInsensitive('y', istr))
{
/// Check if it's just short form `Y` or full form `Yes`
if (checkCharCaseInsensitive('e', istr))
{
if (!checkCharCaseInsensitive('s', istr))
return false;
}
column->insert(true);
}
/// 'No' and 'N'
else if (checkCharCaseInsensitive('n', istr))
{
/// Check if it's just short form `N` or full form `No`
checkCharCaseInsensitive('o', istr);
column->insert(false);
}
/// 'On' and 'Off'
else if (checkCharCaseInsensitive('o', istr))
{
if (checkCharCaseInsensitive('n', istr))
column->insert(true);
else if (checkStringCaseInsensitive("ff", istr))
{
column->insert(false);
}
else
return false;
}
/// 'Enable' and 'Enabled'
else if (checkStringCaseInsensitive("enable", istr))
{
/// Check if it's 'enable' or 'enabled'
checkCharCaseInsensitive('d', istr);
column->insert(true);
}
/// 'Disable' and 'Disabled'
else if (checkStringCaseInsensitive("disable", istr))
{
/// Check if it's 'disable' or 'disabled'
checkCharCaseInsensitive('d', istr);
column->insert(false);
}
else
{
return false;
}

return true;
}

void deserializeImpl(
IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function<bool(ReadBuffer &)> check_end_of_value)
{
DB::ColumnUInt8 * col = checkAndGetDeserializeColumnType(column);

DB::PeekableReadBuffer buf(istr);
buf.setCheckpoint();
if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf))
{
col->insert(true);
return;
}

buf.rollbackToCheckpoint();
if (checkString(settings.bool_false_representation, buf) && check_end_of_value(buf))
{
col->insert(false);
buf.dropCheckpoint();
if (buf.hasUnreadData())
throw Exception(
ErrorCodes::CANNOT_PARSE_BOOL,
"Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if "
"bool_true_representation or bool_false_representation contains some delimiters of input format");
return;
}

buf.rollbackToCheckpoint();
if (tryDeserializeAllVariants(col, buf) && check_end_of_value(buf))
{
buf.dropCheckpoint();
if (buf.hasUnreadData())
throw Exception(
ErrorCodes::CANNOT_PARSE_BOOL,
"Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if "
"bool_true_representation or bool_false_representation contains some delimiters of input format");
return;
}

buf.makeContinuousMemoryFromCheckpointToPos();
buf.rollbackToCheckpoint();
throw Exception(
ErrorCodes::CANNOT_PARSE_BOOL,
"Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and "
"bool_false_representation or one of "
"True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0",
String(buf.position(), std::min(10lu, buf.available())),
settings.bool_true_representation,
settings.bool_false_representation);
}


void deserializeExcelBoolTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings)
{
if (istr.eof())
throw DB::Exception(DB::ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF.");

deserializeImpl(
column,
istr,
settings,
[&](DB::ReadBuffer & buf)
{
/// skip all chars before quote/delimiter exclude line delimiter
while (!buf.eof() && *buf.position() == ' ')
++buf.position();

return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r';
});
}

}
32 changes: 32 additions & 0 deletions cpp-ch/local-engine/Storages/Serializations/ExcelBoolReader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <Columns/IColumn.h>
#include <Formats/FormatSettings.h>
#include <IO/ReadBuffer.h>


namespace local_engine
{
using namespace DB;



void deserializeExcelBoolTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings);

}
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
#include "ExcelSerialization.h"
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/Serializations/SerializationBool.h>
#include <DataTypes/Serializations/SerializationDate32.h>
#include <DataTypes/Serializations/SerializationDateTime64.h>
#include <DataTypes/Serializations/SerializationDecimal.h>
#include <DataTypes/Serializations/SerializationNumber.h>
#include <DataTypes/Serializations/SerializationString.h>
#include "ExcelBoolReader.h"
#include "ExcelReadHelpers.h"
#include "ExcelStringReader.h"

Expand Down Expand Up @@ -89,7 +91,11 @@ void ExcelSerialization::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
}
else if (typeid_cast<const SerializationString *>(nested_ptr.get()))
{
deserializeExcelTextCSV(column, istr, settings, escape);
deserializeExcelStringTextCSV(column, istr, settings, escape);
}
else if (typeid_cast<const SerializationBool *>(nested_ptr.get()))
{
deserializeExcelBoolTextCSV(column, istr, settings);
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ void readExcelCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::
}
}

void deserializeExcelTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value)
void deserializeExcelStringTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value)
{
excelRead(column, [&](ColumnString::Chars & data) { readExcelCSVStringInto(data, istr, settings.csv, escape_value); });
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ template <typename Vector, bool include_quotes = false>
void readExcelCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings, const String & escape_value);


void deserializeExcelTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value);
void deserializeExcelStringTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & escape_value);


}
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,18 @@ bool ExcelTextFormatReader::readField(
|| (format_settings.csv.allow_double_quotes && maybe_quote == '\"'))
has_quote = true;

auto column_back_func = [&column_size](DB::IColumn & column_back) -> void
{
if (column_back.isNullable())
{
ColumnNullable & col = assert_cast<ColumnNullable &>(column_back);
if (col.getNullMapData().size() == column_size + 1)
col.getNullMapData().pop_back();
if (col.getNestedColumn().size() == column_size + 1)
col.getNestedColumn().popBack(1);
}
};

try
{
/// Read the column normally.
Expand All @@ -254,9 +266,7 @@ bool ExcelTextFormatReader::readField(
throw;

skipErrorChars(*buf, has_quote, maybe_quote, format_settings);

if (column.size() == column_size + 1)
column.popBack(1);
column_back_func(column);
column.insertDefault();

return false;
Expand All @@ -265,13 +275,7 @@ bool ExcelTextFormatReader::readField(
if (column_size == column.size())
{
skipErrorChars(*buf, has_quote, maybe_quote, format_settings);
if (column.isNullable())
{
ColumnNullable & col = assert_cast<ColumnNullable &>(column);
if (col.getNullMapData().size() == column_size + 1)
col.getNullMapData().pop_back();
}

column_back_func(column);
column.insertDefault();
return false;
}
Expand Down

0 comments on commit fbaf8d7

Please sign in to comment.