Skip to content

Commit

Permalink
[GLUTEN-3542][CH] Cancel empty string as null representation when reading csv with excel format (#3543)
Browse files Browse the repository at this point in the history

* Fix issue-3542: stop treating an empty string as the null representation when reading CSV with the Excel format

* Fix issue-3542: add a switch to control whether an empty string is treated as null when reading CSV with the Excel format

* fix checkstyle problem

* Reduce the amount of change in the unit tests

* fix issue-3542 UT
  • Loading branch information
lhuang09287750 authored Nov 3, 2023
1 parent 07ba657 commit 1bacef5
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1108,32 +1108,73 @@ class GlutenClickHouseFileFormatSuite
}

// Test "issue-2881 null string test": reads null_string.csv as (c1: String, c2: Short)
// with the CH runtime setting use_excel_serialization.empty_as_null mapped to "true",
// and checks the result against three rows in which c1 is null for c2 = 1 and c2 = 2.
// NOTE(review): this listing looks like a diff rendering with removed and added lines
// interleaved (file_path, schema, options and df are each declared twice) -- confirm
// against the real file which declarations belong to the current version.
test("issue-2881 null string test") {
val file_path = csvDataPath + "/null_string.csv"
val schema = StructType.apply(
Seq(
StructField.apply("c1", StringType, nullable = true),
StructField.apply("c2", ShortType, nullable = true)
))
// Wrap the read in withSQLConf so the empty_as_null key is passed with the value "true".
withSQLConf(
(
"spark.gluten.sql.columnar.backend.ch.runtime_settings." +
"use_excel_serialization.empty_as_null",
"true")) {
val file_path = csvDataPath + "/null_string.csv"
val schema = StructType.apply(
Seq(
StructField.apply("c1", StringType, nullable = true),
StructField.apply("c2", ShortType, nullable = true)
))

// Reader options: comma-delimited CSV.
val options = new util.HashMap[String, String]()
options.put("delimiter", ",")

// DataFrame under test, read with the explicit schema above.
val df = spark.read
.options(options)
.schema(schema)
.csv(file_path)
.toDF()

val options = new util.HashMap[String, String]()
options.put("delimiter", ",")
// Expected rows: c1 is null for c2 = 1 and c2 = 2, and "1" for c2 = 3.
val dataCorrect = new util.ArrayList[Row]()
dataCorrect.add(Row(null, 1.toShort))
dataCorrect.add(Row(null, 2.toShort))
dataCorrect.add(Row("1", 3.toShort))

val df = spark.read
.options(options)
.schema(schema)
.csv(file_path)
.toDF()
// The expected answer is materialised inside withSQLConf(vanillaSparkConfs(): _*);
// presumably this runs it on vanilla Spark rather than the backend under test -- TODO confirm.
var expectedAnswer: Seq[Row] = null
withSQLConf(vanillaSparkConfs(): _*) {
expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
}
checkAnswer(df, expectedAnswer)
}
}

val dataCorrect = new util.ArrayList[Row]()
dataCorrect.add(Row(null, 1.toShort))
dataCorrect.add(Row(null, 2.toShort))
dataCorrect.add(Row("1", 3.toShort))
// Test "issue-3542 null string test": reads null_string.csv as (c1: String, c2: Short)
// with the CH runtime setting use_excel_serialization.empty_as_null mapped to "false",
// and checks the result against three rows in which c1 is null for c2 = 1 and an
// empty string for c2 = 2.
// NOTE(review): this listing looks like a diff rendering with removed and added lines
// interleaved (expectedAnswer is declared twice, checkAnswer is called twice, and the
// braces do not balance as shown) -- confirm against the real file.
test("issue-3542 null string test") {
// Wrap the read in withSQLConf so the empty_as_null key is passed with the value "false".
withSQLConf(
(
"spark.gluten.sql.columnar.backend.ch.runtime_settings." +
"use_excel_serialization.empty_as_null",
"false")) {
val file_path = csvDataPath + "/null_string.csv"
val schema = StructType.apply(
Seq(
StructField.apply("c1", StringType, nullable = true),
StructField.apply("c2", ShortType, nullable = true)
))

// Reader options: comma-delimited CSV.
val options = new util.HashMap[String, String]()
options.put("delimiter", ",")

// DataFrame under test, read with the explicit schema above.
val df = spark.read
.options(options)
.schema(schema)
.csv(file_path)
.toDF()

var expectedAnswer: Seq[Row] = null
withSQLConf(vanillaSparkConfs(): _*) {
expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
// Expected rows: c1 is null for c2 = 1, "" for c2 = 2, and "1" for c2 = 3.
val dataCorrect = new util.ArrayList[Row]()
dataCorrect.add(Row(null, 1.toShort))
dataCorrect.add(Row("", 2.toShort))
dataCorrect.add(Row("1", 3.toShort))

// The expected answer is materialised inside withSQLConf(vanillaSparkConfs(): _*);
// presumably this runs it on vanilla Spark rather than the backend under test -- TODO confirm.
var expectedAnswer: Seq[Row] = null
withSQLConf(vanillaSparkConfs(): _*) {
expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
}
checkAnswer(df, expectedAnswer)
}
checkAnswer(df, expectedAnswer)
}

test("test integer read with sign at the end of line") {
Expand Down
1 change: 1 addition & 0 deletions cpp-ch/local-engine/Common/CHUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ class BackendInitializerUtil

// use excel text parser
inline static const std::string USE_EXCEL_PARSER = "use_excel_serialization";
inline static const std::string EXCEL_EMPTY_AS_NULL = "use_excel_serialization.empty_as_null";
inline static const String CH_BACKEND_PREFIX = "spark.gluten.sql.columnar.backend.ch";

inline static const String CH_RUNTIME_CONFIG = "runtime_config";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "Common/CHUtil.h"
#include "ExcelTextFormatFile.h"


Expand Down Expand Up @@ -101,7 +102,14 @@ DB::FormatSettings ExcelTextFormatFile::createFormatSettings()
if (!file_info.text().null_value().empty())
format_settings.csv.null_representation = file_info.text().null_value();

format_settings.csv.empty_as_default = true;
bool empty_as_null = true;
if (context->getSettings().has(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL))
empty_as_null = context->getSettings().getString(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL) == "'true'";

if (format_settings.csv.null_representation.empty() || empty_as_null)
format_settings.csv.empty_as_default = true;
else
format_settings.csv.empty_as_default = false;

char quote = *file_info.text().quote().data();
if (quote == '\'')
Expand Down

0 comments on commit 1bacef5

Please sign in to comment.