From e86ff05160cab0b5df5bafc7934954d0b602b74f Mon Sep 17 00:00:00 2001 From: Zuyu ZHANG Date: Fri, 13 Dec 2024 00:21:01 -0800 Subject: [PATCH] fix: Align TimestampUnit and TimestampPrecision in unit tests (#11698) Summary: Fixes https://github.com/facebookincubator/velox/issues/11607 Pull Request resolved: https://github.com/facebookincubator/velox/pull/11698 Reviewed By: xiaoxmeng Differential Revision: D67186126 Pulled By: zacw7 fbshipit-source-id: bbd262cf69d685be1bf941278a2a395d6e9e36b3 --- velox/connectors/hive/HiveConnectorUtil.cpp | 7 ++-- .../hive/tests/HiveConnectorUtilTest.cpp | 3 +- velox/docs/configs.rst | 6 ++-- .../parquet/tests/reader/E2EFilterTest.cpp | 4 +-- .../tests/reader/ParquetTableScanTest.cpp | 35 +++++++++++++------ .../tests/writer/ParquetWriterTest.cpp | 6 ++-- velox/dwio/parquet/writer/Writer.cpp | 3 +- velox/dwio/parquet/writer/Writer.h | 4 +-- .../sparksql/fuzzer/SparkQueryRunner.cpp | 2 +- 9 files changed, 42 insertions(+), 28 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index e1694486b9de..7ed63620ee3e 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -906,16 +906,15 @@ core::TypedExprPtr extractFiltersFromRemainingFilter( namespace { #ifdef VELOX_ENABLE_PARQUET -std::optional getTimestampUnit( +std::optional getTimestampUnit( const config::ConfigBase& config, const char* configKey) { if (const auto unit = config.get(configKey)) { VELOX_CHECK( - unit == 0 /*second*/ || unit == 3 /*milli*/ || unit == 6 /*micro*/ || - unit == 9 /*nano*/, + unit == 3 /*milli*/ || unit == 6 /*micro*/ || unit == 9 /*nano*/, "Invalid timestamp unit: {}", unit.value()); - return std::optional(static_cast(unit.value())); + return std::optional(static_cast(unit.value())); } return std::nullopt; } diff --git a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp index b7a7ba335c12..f6b687ad915e 100644 --- a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp @@ -424,7 +424,8 @@ TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) { auto parquetOptions = std::dynamic_pointer_cast(options); ASSERT_EQ( - parquetOptions->parquetWriteTimestampUnit.value(), TimestampUnit::kMilli); + parquetOptions->parquetWriteTimestampUnit.value(), + TimestampPrecision::kMilliseconds); ASSERT_EQ(parquetOptions->parquetWriteTimestampTimeZone.value(), "UTC"); } #endif diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index f25b89a5518e..af7e26a60abb 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -558,7 +558,7 @@ Each query can override the config by setting corresponding query session proper - tinyint - 9 - Timestamp unit used when writing timestamps into Parquet through Arrow bridge. - Valid values are 0 (second), 3 (millisecond), 6 (microsecond), 9 (nanosecond). + Valid values are 3 (millisecond), 6 (microsecond), and 9 (nanosecond). * - hive.orc.writer.linear-stripe-size-heuristics - orc_writer_linear_stripe_size_heuristics - bool @@ -709,7 +709,7 @@ These semantics are similar to the `Apache Hadoop-Aws module .dfs.core.windows.net - string - SharedKey - - Specifies the authentication mechanism to use for Azure storage accounts. + - Specifies the authentication mechanism to use for Azure storage accounts. **Allowed values:** "SharedKey", "OAuth", "SAS". "SharedKey": Uses the storage account name and key for authentication. "OAuth": Utilizes OAuth tokens for secure authentication. @@ -739,7 +739,7 @@ These semantics are similar to the `Apache Hadoop-Aws module .dfs.core.windows.net - string - - - Specifies the OAuth 2.0 token endpoint URL for the Azure AD application. + - Specifies the OAuth 2.0 token endpoint URL for the Azure AD application. This endpoint is used to acquire access tokens for authenticating with Azure storage. The URL follows the format: `https://login.microsoftonline.com//oauth2/token`. diff --git a/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp b/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp index 2845659deb80..7f2dc8facb4b 100644 --- a/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp +++ b/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp @@ -258,7 +258,7 @@ TEST_F(E2EFilterTest, integerDictionary) { 20); } -TEST_F(E2EFilterTest, timestampDirect) { +TEST_F(E2EFilterTest, timestampInt96Direct) { options_.enableDictionary = false; options_.dataPageSize = 4 * 1024; options_.writeInt96AsTimestamp = true; @@ -272,7 +272,7 @@ TEST_F(E2EFilterTest, timestampDirect) { 20); } -TEST_F(E2EFilterTest, timestampDictionary) { +TEST_F(E2EFilterTest, timestampInt96Dictionary) { options_.dataPageSize = 4 * 1024; options_.writeInt96AsTimestamp = true; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 2c239e14256b..20826137cba1 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -99,7 +99,13 @@ class ParquetTableScanTest : public HiveConnectorTestBase { rowType, subfieldFilters, remainingFilter, nullptr, assignments) .planNode(); - assertQuery(plan, splits_, sql); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kReadTimestampUnitSession, + std::to_string(static_cast(timestampPrecision_))) + .splits(splits_) + .assertResults(sql); } void assertSelectWithAgg( @@ -211,6 +217,10 @@ class ParquetTableScanTest : public HiveConnectorTestBase { rootPool_->addAggregateChild("ParquetTableScanTest.Writer"); options.memoryPool = childPool.get(); + if (options.parquetWriteTimestampUnit.has_value()) { + timestampPrecision_ = options.parquetWriteTimestampUnit.value(); + } + auto writer = std::make_unique( std::move(sink), options, asRowType(data[0]->type())); @@ -220,7 +230,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase { writer->close(); } - void testInt96TimestampRead(const WriterOptions& options) { + void testTimestampRead(const WriterOptions& options) { auto stringToTimestamp = [](std::string_view view) { return util::fromTimestampString( view.data(), @@ -231,16 +241,16 @@ class ParquetTableScanTest : public HiveConnectorTestBase { }); }; std::vector views = { - "2015-06-01 19:34:56", - "2015-06-02 19:34:56", - "2001-02-03 03:34:06", - "1998-03-01 08:01:06", + "2015-06-01 19:34:56.007", + "2015-06-02 19:34:56.12306", + "2001-02-03 03:34:06.056", + "1998-03-01 08:01:06.996669", "2022-12-23 03:56:01", "1980-01-24 00:23:07", - "1999-12-08 13:39:26", - "2023-04-21 09:09:34", + "1999-12-08 13:39:26.123456", + "2023-04-21 09:09:34.5", "2000-09-12 22:36:29", - "2007-12-12 04:27:56", + "2007-12-12 04:27:56.999", }; std::vector values; values.reserve(views.size()); @@ -298,6 +308,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase { RowTypePtr rowType_; std::vector> splits_; + TimestampPrecision timestampPrecision_ = TimestampPrecision::kMicroseconds; }; TEST_F(ParquetTableScanTest, basic) { @@ -820,14 +831,16 @@ TEST_F(ParquetTableScanTest, timestampInt96Dictionary) { WriterOptions options; options.writeInt96AsTimestamp = true; options.enableDictionary = true; - testInt96TimestampRead(options); + options.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds; + testTimestampRead(options); } TEST_F(ParquetTableScanTest, timestampInt96Plain) { WriterOptions options; options.writeInt96AsTimestamp = true; options.enableDictionary = false; - testInt96TimestampRead(options); + options.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds; + testTimestampRead(options); } TEST_F(ParquetTableScanTest, timestampPrecisionMicrosecond) { diff --git a/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp b/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp index c7510d27bff8..0e93a840d564 100644 --- a/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp +++ b/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp @@ -162,7 +162,7 @@ DEBUG_ONLY_TEST_F(ParquetWriterTest, unitFromWriterOptions) { 10'000, [](auto row) { return Timestamp(row, row); })}); parquet::WriterOptions writerOptions; writerOptions.memoryPool = leafPool_.get(); - writerOptions.parquetWriteTimestampUnit = TimestampUnit::kMicro; + writerOptions.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds; writerOptions.parquetWriteTimestampTimeZone = "America/Los_Angeles"; // Create an in-memory writer. @@ -191,7 +191,7 @@ TEST_F(ParquetWriterTest, parquetWriteTimestampTimeZoneWithDefault) { 10'000, [](auto row) { return Timestamp(row, row); })}); parquet::WriterOptions writerOptions; writerOptions.memoryPool = leafPool_.get(); - writerOptions.parquetWriteTimestampUnit = TimestampUnit::kMicro; + writerOptions.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds; // Create an in-memory writer. auto sink = std::make_unique( @@ -220,7 +220,7 @@ DEBUG_ONLY_TEST_F(ParquetWriterTest, unitFromHiveConfig) { const auto outputDirectory = TempDirectoryPath::create(); auto writerOptions = std::make_shared(); - writerOptions->parquetWriteTimestampUnit = TimestampUnit::kMicro; + writerOptions->parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds; const auto plan = PlanBuilder() .values({data}) diff --git a/velox/dwio/parquet/writer/Writer.cpp b/velox/dwio/parquet/writer/Writer.cpp index bce6919a7cc6..164d06971c4a 100644 --- a/velox/dwio/parquet/writer/Writer.cpp +++ b/velox/dwio/parquet/writer/Writer.cpp @@ -239,7 +239,8 @@ Writer::Writer( flushPolicy_ = std::make_unique(); } options_.timestampUnit = - options.parquetWriteTimestampUnit.value_or(TimestampUnit::kNano); + static_cast(options.parquetWriteTimestampUnit.value_or( + TimestampPrecision::kNanoseconds)); options_.timestampTimeZone = options.parquetWriteTimestampTimeZone; arrowContext_->properties = getArrowParquetWriterOptions(options, flushPolicy_); diff --git a/velox/dwio/parquet/writer/Writer.h b/velox/dwio/parquet/writer/Writer.h index 26969a59d52f..20c8ab2b3291 100644 --- a/velox/dwio/parquet/writer/Writer.h +++ b/velox/dwio/parquet/writer/Writer.h @@ -103,8 +103,8 @@ struct WriterOptions : public dwio::common::WriterOptions { columnCompressionsMap; /// Timestamp unit for Parquet write through Arrow bridge. - /// Default if not specified: TimestampUnit::kNano (9). - std::optional parquetWriteTimestampUnit; + /// Default if not specified: TimestampPrecision::kNanoseconds (9). + std::optional parquetWriteTimestampUnit; /// Timestamp time zone for Parquet write through Arrow bridge. std::optional parquetWriteTimestampTimeZone; bool writeInt96AsTimestamp = false; diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp index 07cf0dd70392..aeec3cc5352f 100644 --- a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp +++ b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp @@ -51,7 +51,7 @@ void writeToFile( options->memoryPool = pool; // Spark does not recognize int64-timestamp written as nano precision in // Parquet. - options->parquetWriteTimestampUnit = TimestampUnit::kMicro; + options->parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds; auto writeFile = std::make_unique(path, true, false); auto sink =