Skip to content

Commit

Permalink
fix issue 3609 (#3614)
Browse files Browse the repository at this point in the history
  • Loading branch information
lhuang09287750 authored Nov 7, 2023
1 parent 9c2d057 commit a14baf3
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
15.01.02.01.002,CS-001,85.66%,15.01
Original file line number Diff line number Diff line change
Expand Up @@ -1233,6 +1233,74 @@ class GlutenClickHouseFileFormatSuite
)
}

test("issues-3609 int read test") {
  // Regression test for issue 3609: reads int_special.csv with an all-integer schema and
  // checks the parsed row under both values of the `number_force` runtime setting.
  val numberForceKey =
    "spark.gluten.sql.columnar.backend.ch.runtime_settings." +
      "use_excel_serialization.number_force"
  val csvPath = csvDataPath + "/int_special.csv"

  // Four nullable integer columns: a, b, c, d.
  val schema = StructType(
    Seq("a", "b", "c", "d").map(StructField(_, IntegerType, nullable = true)))

  // Reads the CSV with `number_force` set to `numberForce` and asserts that the result equals
  // a single-row DataFrame containing `expectedRow`.
  def checkRead(numberForce: String, expectedRow: Row): Unit =
    withSQLConf(numberForceKey -> numberForce) {
      val df = spark.read
        .options(Map("delimiter" -> ",", "header" -> "false"))
        .schema(schema)
        .csv(csvPath)
        .toDF()

      val expectedRows = new util.ArrayList[Row]()
      expectedRows.add(expectedRow)

      // The expected rows are materialised while the vanilla Spark confs are active.
      // `collect()` is eager, so it has to run inside this block; the enclosing `var` is only
      // there to carry the result out of the block.
      var expectedAnswer: Seq[Row] = Seq.empty
      withSQLConf(vanillaSparkConfs(): _*) {
        expectedAnswer = spark.createDataFrame(expectedRows, schema).toDF().collect()
      }
      checkAnswer(df, expectedAnswer)
    }

  // number_force = false: the first three columns are expected to come back as null.
  checkRead("false", Row(null, null, null, 15))
  // number_force = true: every column is expected to yield an integer.
  checkRead("true", Row(15, -1, 85, 15))
}

def createEmptyParquet(): String = {
val data = spark.sparkContext.emptyRDD[Row]
val schema = new StructType()
Expand Down
1 change: 1 addition & 0 deletions cpp-ch/local-engine/Common/CHUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ class BackendInitializerUtil
// use excel text parser
inline static const std::string USE_EXCEL_PARSER = "use_excel_serialization";
inline static const std::string EXCEL_EMPTY_AS_NULL = "use_excel_serialization.empty_as_null";
inline static const std::string EXCEL_NUMBER_FORCE = "use_excel_serialization.number_force";
inline static const String CH_BACKEND_PREFIX = "spark.gluten.sql.columnar.backend.ch";

inline static const String CH_RUNTIME_CONFIG = "runtime_config";
Expand Down
10 changes: 7 additions & 3 deletions cpp-ch/local-engine/Storages/Serializations/ExcelNumberReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ inline bool readExcelFloatTextFastImpl(T & x, DB::ReadBuffer & in, bool has_quot
template <typename T>
bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB::FormatSettings & settings)
{
bool number_force = settings.try_infer_integers==1;
const UInt8 MAX_HEAD_SKIP = 2;
const UInt8 MAX_TAIL_SKIP = 2;
UInt8 head_skip=0;
Expand Down Expand Up @@ -400,7 +401,10 @@ bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB:
{
if (!(*buf.position() >= '0' && *buf.position() <= '9'))
{
break;
if (number_force)
break;
else
return false;
}
else
{
Expand Down Expand Up @@ -448,7 +452,7 @@ bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB:
{
continue;
}
else if (has_number && !(*buf.position() >= '0' && *buf.position() <= '9')) // process suffix
else if (has_number && !(*buf.position() >= '0' && *buf.position() <= '9') && number_force) // process suffix
{
while (!buf.eof())
{
Expand All @@ -467,7 +471,7 @@ bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB:
}
break;
}
else if (!has_number && !(*buf.position() >= '0' && *buf.position() <= '9')) // process prefix
else if (!has_number && !(*buf.position() >= '0' && *buf.position() <= '9') && number_force) // process prefix
{
if(*buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r')
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ DB::FormatSettings ExcelTextFormatFile::createFormatSettings()
bool empty_as_null = true;
if (context->getSettings().has(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL))
empty_as_null = context->getSettings().getString(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL) == "'true'";

format_settings.try_infer_integers = 0;
if (!context->getSettings().has(BackendInitializerUtil::EXCEL_NUMBER_FORCE))
format_settings.try_infer_integers = 1;
if (context->getSettings().has(BackendInitializerUtil::EXCEL_NUMBER_FORCE) &&
context->getSettings().getString(BackendInitializerUtil::EXCEL_NUMBER_FORCE) == "'true'")
format_settings.try_infer_integers = 1;

if (format_settings.csv.null_representation.empty() || empty_as_null)
format_settings.csv.empty_as_default = true;
Expand Down

0 comments on commit a14baf3

Please sign in to comment.