Commit a5a6d82
make ut less change
lhuang09287750 committed Oct 31, 2023
1 parent 7449eb9 commit a5a6d82
Showing 2 changed files with 81 additions and 84 deletions.
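Reading from the diff below, the commit appears to do one coordinated thing: the ClickHouse backend now treats use_excel_serialization.empty_as_null as enabled when the setting is absent, so the two CSV read tests that previously forced it to "true" via withSQLConf can drop their wrappers, while the null-string test gains a wrapper pinning it to "false". Independently, the escape_without_quote test switches from hard-coded expected rows to comparing against vanilla Spark output.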
File 1 of 2: GlutenClickHouseFileFormatSuite (Scala)
@@ -154,30 +154,24 @@ class GlutenClickHouseFileFormatSuite
   }
 
   test("read data from csv file format") {
-    withSQLConf(
-      (
-        "spark.gluten.sql.columnar.backend.ch.runtime_settings." +
-          "use_excel_serialization.empty_as_null",
-        "true")) {
-      val filePath = basePath + "/csv_test.csv"
-      val csvFileFormat = "csv"
-      val sql =
-        s"""
-           | select *
-           | from $csvFileFormat.`$filePath`
-           |""".stripMargin
-      testFileFormatBase(
-        filePath,
-        csvFileFormat,
-        sql,
-        df => {
-          val csvFileScan = collect(df.queryExecution.executedPlan) {
-            case f: FileSourceScanExecTransformer => f
-          }
-          assert(csvFileScan.size == 1)
-        }
-      )
-    }
+    val filePath = basePath + "/csv_test.csv"
+    val csvFileFormat = "csv"
+    val sql =
+      s"""
+         | select *
+         | from $csvFileFormat.`$filePath`
+         |""".stripMargin
+    testFileFormatBase(
+      filePath,
+      csvFileFormat,
+      sql,
+      df => {
+        val csvFileScan = collect(df.queryExecution.executedPlan) {
+          case f: FileSourceScanExecTransformer => f
+        }
+        assert(csvFileScan.size == 1)
+      }
+    )
   }
 
   test("read data from csv file format with filter") {
@@ -203,32 +197,26 @@ class GlutenClickHouseFileFormatSuite
   }
 
   test("read data from csv file format witsh agg") {
-    withSQLConf(
-      (
-        "spark.gluten.sql.columnar.backend.ch.runtime_settings." +
-          "use_excel_serialization.empty_as_null",
-        "true")) {
-      val filePath = basePath + "/csv_test_agg.csv"
-      val csvFileFormat = "csv"
-      val sql =
-        s"""
-           | select _c7, count(_c0), sum(_c1), avg(_c2), min(_c3), max(_c4), sum(_c5), sum(_c8)
-           | from $csvFileFormat.`$filePath`
-           | group by _c7
-           |""".stripMargin
-      testFileFormatBase(
-        filePath,
-        csvFileFormat,
-        sql,
-        df => {
-          val csvFileScan = collect(df.queryExecution.executedPlan) {
-            case f: FileSourceScanExecTransformer => f
-          }
-          assert(csvFileScan.size == 1)
-        },
-        noFallBack = false
-      )
-    }
+    val filePath = basePath + "/csv_test_agg.csv"
+    val csvFileFormat = "csv"
+    val sql =
+      s"""
+         | select _c7, count(_c0), sum(_c1), avg(_c2), min(_c3), max(_c4), sum(_c5), sum(_c8)
+         | from $csvFileFormat.`$filePath`
+         | group by _c7
+         |""".stripMargin
+    testFileFormatBase(
+      filePath,
+      csvFileFormat,
+      sql,
+      df => {
+        val csvFileScan = collect(df.queryExecution.executedPlan) {
+          case f: FileSourceScanExecTransformer => f
+        }
+        assert(csvFileScan.size == 1)
+      },
+      noFallBack = false
+    )
  }
 
   test("read normal csv") {
@@ -904,15 +892,17 @@ class GlutenClickHouseFileFormatSuite
       .csv(csvDataPath + "/escape_without_quote.csv")
       .toDF()
 
-    val result = df.collect()
-
-    assert(result.length == 3)
-    assert(result.apply(0).getString(0) == "1\\")
-    assert(result.apply(0).getString(1) == "656")
-    assert(result.apply(1).getString(0) == "")
-    assert(result.apply(1).getString(1) == "123")
-    assert(result.apply(2).getString(0) == "123456789012345\\\\7")
-    assert(result.apply(2).getString(1) == "123")
+    var expectedAnswer: Seq[Row] = null
+    withSQLConf(vanillaSparkConfs(): _*) {
+      expectedAnswer = spark.read
+        .option("delimiter", ",")
+        .option("escape", "\\")
+        .schema(schema)
+        .csv(csvDataPath + "/escape_without_quote.csv")
+        .toDF()
+        .collect()
+    }
+    checkAnswer(df, expectedAnswer)
 
     val csvFileScan = collect(df.queryExecution.executedPlan) {
       case f: FileSourceScanExecTransformer => f
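Note: instead of hard-coding expected row values, the test now re-reads the same file under vanillaSparkConfs() (i.e., with the Gluten transformer disabled) and uses the vanilla result as the expected answer. checkAnswer then compares the Gluten-produced DataFrame to those rows ignoring order; a simplified sketch of that comparison, not the real Spark QueryTest implementation:

import org.apache.spark.sql.{DataFrame, Row}

// Simplified sketch of a checkAnswer-style comparison (the real helper also
// normalizes values and prints a readable diff on failure).
def checkAnswerSketch(df: DataFrame, expected: Seq[Row]): Unit = {
  val actual = df.collect().toSeq
  assert(
    actual.map(_.toString).sorted == expected.map(_.toString).sorted,
    s"expected ${expected.mkString(", ")} but got ${actual.mkString(", ")}")
}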
@@ -1118,32 +1108,38 @@ class GlutenClickHouseFileFormatSuite
   }
 
   test("issue-2881 & issue-3542 null string test") {
-    val file_path = csvDataPath + "/null_string.csv"
-    val schema = StructType.apply(
-      Seq(
-        StructField.apply("c1", StringType, nullable = true),
-        StructField.apply("c2", ShortType, nullable = true)
-      ))
-
-    val options = new util.HashMap[String, String]()
-    options.put("delimiter", ",")
-
-    val df = spark.read
-      .options(options)
-      .schema(schema)
-      .csv(file_path)
-      .toDF()
-
-    val dataCorrect = new util.ArrayList[Row]()
-    dataCorrect.add(Row(null, 1.toShort))
-    dataCorrect.add(Row("", 2.toShort))
-    dataCorrect.add(Row("1", 3.toShort))
-
-    var expectedAnswer: Seq[Row] = null
-    withSQLConf(vanillaSparkConfs(): _*) {
-      expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
-    }
-    checkAnswer(df, expectedAnswer)
+    withSQLConf(
+      (
+        "spark.gluten.sql.columnar.backend.ch.runtime_settings." +
+          "use_excel_serialization.empty_as_null",
+        "false")) {
+      val file_path = csvDataPath + "/null_string.csv"
+      val schema = StructType.apply(
+        Seq(
+          StructField.apply("c1", StringType, nullable = true),
+          StructField.apply("c2", ShortType, nullable = true)
+        ))
+
+      val options = new util.HashMap[String, String]()
+      options.put("delimiter", ",")
+
+      val df = spark.read
+        .options(options)
+        .schema(schema)
+        .csv(file_path)
+        .toDF()
+
+      val dataCorrect = new util.ArrayList[Row]()
+      dataCorrect.add(Row(null, 1.toShort))
+      dataCorrect.add(Row("", 2.toShort))
+      dataCorrect.add(Row("1", 3.toShort))
+
+      var expectedAnswer: Seq[Row] = null
+      withSQLConf(vanillaSparkConfs(): _*) {
+        expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
+      }
+      checkAnswer(df, expectedAnswer)
+    }
   }
 
   test("test integer read with sign at the end of line") {
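Note the inverted direction here: rather than dropping a wrapper, this test now pins use_excel_serialization.empty_as_null to "false". Its expected data distinguishes a true null (Row(null, 1.toShort)) from an empty string (Row("", 2.toShort)); under the new backend default of true, both would come back as null, so the test must opt out explicitly. See the behavior sketch after the C++ hunk below.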
File 2 of 2: ExcelTextFormatFile (C++)
@@ -102,9 +102,10 @@ DB::FormatSettings ExcelTextFormatFile::createFormatSettings()
     if (!file_info.text().null_value().empty())
         format_settings.csv.null_representation = file_info.text().null_value();
 
-    bool empty_as_null = context->getSettings().has(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL) &&
-        context->getSettings().getString(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL) == "'true'";
-
+    bool empty_as_null = true;
+    if (context->getSettings().has(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL))
+        empty_as_null = context->getSettings().getString(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL) == "'true'";
+
     if (format_settings.csv.null_representation.empty() || empty_as_null)
         format_settings.csv.empty_as_default = true;
     else
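The only behavioral difference between the removed and added lines is the fallback when the setting is absent: previously empty_as_null evaluated to false, now it defaults to true, the value the tests used to set explicitly. Transcribed into Scala for a side-by-side comparison; a hedged sketch, where settingValue stands in for the optional ClickHouse setting string (which arrives single-quoted, hence the "'true'" literal visible in the diff):

// Old: absent setting => false; present => compare against "'true'".
def emptyAsNullOld(settingValue: Option[String]): Boolean =
  settingValue.contains("'true'")

// New: absent setting => true (the new default); present => same comparison.
def emptyAsNullNew(settingValue: Option[String]): Boolean =
  settingValue.forall(_ == "'true'")

Either way, when empty_as_null holds (or no null representation is configured), format_settings.csv.empty_as_default is enabled, which is what makes empty CSV fields read back as null/default values.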
