fix issue 3609 (#3614)

apache · Nov 7, 2023 · a14baf3 · a14baf3
1 parent 9c2d057
commit a14baf3
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 3 deletions.
diff --git a/backends-clickhouse/src/test/resources/csv-data/int_special.csv b/backends-clickhouse/src/test/resources/csv-data/int_special.csv
@@ -0,0 +1 @@
+15.01.02.01.002,CS-001,85.66%,15.01
diff --git a/...lickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseFileFormatSuite.scala b/...lickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseFileFormatSuite.scala
@@ -1233,6 +1233,74 @@ class GlutenClickHouseFileFormatSuite
     )
   }
 
+  test("issues-3609 int read test") { // same CSV read twice: number_force off, then on
+    withSQLConf(
+      (
+        "spark.gluten.sql.columnar.backend.ch.runtime_settings." +
+          "use_excel_serialization.number_force",
+        "false")) { // case 1: number_force disabled
+      val csv_path = csvDataPath + "/int_special.csv"
+      val options = new util.HashMap[String, String]()
+      options.put("delimiter", ",")
+      options.put("header", "false") // the file's single line is data, not a header
+      val schema = StructType.apply( // all four CSV fields are read as nullable ints
+        Seq(
+          StructField.apply("a", IntegerType, nullable = true),
+          StructField.apply("b", IntegerType, nullable = true),
+          StructField.apply("c", IntegerType, nullable = true),
+          StructField.apply("d", IntegerType, nullable = true)
+        ))
+
+      val df = spark.read
+        .options(options)
+        .schema(schema)
+        .csv(csv_path)
+        .toDF()
+
+      val dataCorrect = new util.ArrayList[Row]()
+      dataCorrect.add(Row(null, null, null, 15)) // a, b, c expected null; d ("15.01") -> 15
+
+      var expectedAnswer: Seq[Row] = null
+      withSQLConf(vanillaSparkConfs(): _*) { // presumably disables Gluten offload - confirm
+        expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
+      }
+      checkAnswer(df, expectedAnswer) // CSV read must equal the hand-built expected row
+    }
+
+    withSQLConf(
+      (
+        "spark.gluten.sql.columnar.backend.ch.runtime_settings." +
+          "use_excel_serialization.number_force",
+        "true")) { // case 2: number_force enabled
+      val csv_path = csvDataPath + "/int_special.csv"
+      val options = new util.HashMap[String, String]()
+      options.put("delimiter", ",")
+      options.put("header", "false") // the file's single line is data, not a header
+      val schema = StructType.apply( // same four-int schema as case 1
+        Seq(
+          StructField.apply("a", IntegerType, nullable = true),
+          StructField.apply("b", IntegerType, nullable = true),
+          StructField.apply("c", IntegerType, nullable = true),
+          StructField.apply("d", IntegerType, nullable = true)
+        ))
+
+      val df = spark.read
+        .options(options)
+        .schema(schema)
+        .csv(csv_path)
+        .toDF()
+
+      val dataCorrect = new util.ArrayList[Row]()
+      dataCorrect.add(Row(15, -1, 85, 15)) // expected per field: 15, -1 (from "CS-001"), 85, 15
+
+      var expectedAnswer: Seq[Row] = null
+      withSQLConf(vanillaSparkConfs(): _*) { // presumably disables Gluten offload - confirm
+        expectedAnswer = spark.createDataFrame(dataCorrect, schema).toDF().collect()
+      }
+      checkAnswer(df, expectedAnswer) // CSV read must equal the hand-built expected row
+    }
+  }
+
   def createEmptyParquet(): String = {
     val data = spark.sparkContext.emptyRDD[Row]
     val schema = new StructType()

diff --git a/cpp-ch/local-engine/Common/CHUtil.h b/cpp-ch/local-engine/Common/CHUtil.h
@@ -126,6 +126,7 @@ class BackendInitializerUtil
     // use excel text parser
     inline static const std::string USE_EXCEL_PARSER = "use_excel_serialization";
     inline static const std::string EXCEL_EMPTY_AS_NULL = "use_excel_serialization.empty_as_null";
+    inline static const std::string EXCEL_NUMBER_FORCE = "use_excel_serialization.number_force";
     inline static const String CH_BACKEND_PREFIX = "spark.gluten.sql.columnar.backend.ch";
 
     inline static const String CH_RUNTIME_CONFIG = "runtime_config";

diff --git a/cpp-ch/local-engine/Storages/Serializations/ExcelNumberReader.h b/cpp-ch/local-engine/Storages/Serializations/ExcelNumberReader.h
@@ -333,6 +333,7 @@ inline bool readExcelFloatTextFastImpl(T & x, DB::ReadBuffer & in, bool has_quot
 template <typename T>
 bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB::FormatSettings & settings)
 {
+    bool number_force = settings.try_infer_integers==1;
     const UInt8 MAX_HEAD_SKIP = 2;
     const UInt8 MAX_TAIL_SKIP = 2;
     UInt8 head_skip=0;
@@ -400,7 +401,10 @@ bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB:
                 {
                     if (!(*buf.position() >= '0' && *buf.position() <= '9'))
                     {
-                        break;
+                        if (number_force)
+                            break;
+                        else
+                            return false;
                     }
                     else
                     {
@@ -448,7 +452,7 @@ bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB:
         {
             continue;
         }
-        else if (has_number && !(*buf.position() >= '0' && *buf.position() <= '9')) // process suffix
+        else if (has_number && !(*buf.position() >= '0' && *buf.position() <= '9') && number_force) // process suffix
         {
             while (!buf.eof())
             {
@@ -467,7 +471,7 @@ bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB:
             }
             break;
         }
-        else if (!has_number && !(*buf.position() >= '0' && *buf.position() <= '9')) // process prefix
+        else if (!has_number && !(*buf.position() >= '0' && *buf.position() <= '9') && number_force) // process prefix
         {
             if(*buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r')
             {

diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ExcelTextFormatFile.cpp
@@ -105,6 +105,13 @@ DB::FormatSettings ExcelTextFormatFile::createFormatSettings()
     bool empty_as_null = true;
     if (context->getSettings().has(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL))
         empty_as_null = context->getSettings().getString(BackendInitializerUtil::EXCEL_EMPTY_AS_NULL) == "'true'";
+
+    format_settings.try_infer_integers = 0;
+    if (!context->getSettings().has(BackendInitializerUtil::EXCEL_NUMBER_FORCE))
+        format_settings.try_infer_integers = 1;
+    if (context->getSettings().has(BackendInitializerUtil::EXCEL_NUMBER_FORCE) &&
+        context->getSettings().getString(BackendInitializerUtil::EXCEL_NUMBER_FORCE) == "'true'")
+        format_settings.try_infer_integers = 1;
 
     if (format_settings.csv.null_representation.empty() || empty_as_null)
         format_settings.csv.empty_as_default = true;