fix: Align TimestampUnit and TimestampPrecision in unit tests (#11698)
Summary:
Fixes #11607

Pull Request resolved: #11698

Reviewed By: xiaoxmeng

Differential Revision: D67186126

Pulled By: zacw7

fbshipit-source-id: bbd262cf69d685be1bf941278a2a395d6e9e36b3
zuyu authored and facebook-github-bot committed Dec 13, 2024
1 parent 401d6c7 commit e86ff05
Showing 9 changed files with 42 additions and 28 deletions.
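Background on the rename: the Hive connector and Parquet writer options now expose the reader-facing TimestampPrecision enum instead of the writer-internal TimestampUnit enum. Below is an illustrative sketch of how the two are assumed to relate; the real declarations live elsewhere in Velox, and the enumerator values are inferred from the validation code and docs in this diff. TimestampPrecision has no seconds member, which is why 0 (second) disappears from the accepted config values.

  // Illustrative only; not part of this commit. Both enums encode the number
  // of fractional-second digits, so their numeric values coincide for 3, 6, 9.
  enum class TimestampPrecision {
    kMilliseconds = 3, // 10^-3 s
    kMicroseconds = 6, // 10^-6 s
    kNanoseconds = 9,  // 10^-9 s
  };

  enum class TimestampUnit {
    kSecond = 0,
    kMilli = 3,
    kMicro = 6,
    kNano = 9,
  };

Because the values line up, Writer.cpp (further down) can bridge the public option to the internal unit with a single static_cast.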
7 changes: 3 additions & 4 deletions velox/connectors/hive/HiveConnectorUtil.cpp
@@ -906,16 +906,15 @@ core::TypedExprPtr extractFiltersFromRemainingFilter(
 namespace {
 
 #ifdef VELOX_ENABLE_PARQUET
-std::optional<TimestampUnit> getTimestampUnit(
+std::optional<TimestampPrecision> getTimestampUnit(
     const config::ConfigBase& config,
     const char* configKey) {
   if (const auto unit = config.get<uint8_t>(configKey)) {
     VELOX_CHECK(
-        unit == 0 /*second*/ || unit == 3 /*milli*/ || unit == 6 /*micro*/ ||
-            unit == 9 /*nano*/,
+        unit == 3 /*milli*/ || unit == 6 /*micro*/ || unit == 9 /*nano*/,
         "Invalid timestamp unit: {}",
         unit.value());
-    return std::optional(static_cast<TimestampUnit>(unit.value()));
+    return std::optional(static_cast<TimestampPrecision>(unit.value()));
   }
   return std::nullopt;
 }
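A rough behavioral sketch of the helper above after this change; the config key name and the ConfigBase construction here are assumptions for illustration, not code from the repository:

  // Hypothetical usage; the key name is made up for the example.
  std::unordered_map<std::string, std::string> values{
      {"parquet.writer.timestamp-unit", "6"}};
  config::ConfigBase config(std::move(values));

  // 3, 6 and 9 are accepted and map onto TimestampPrecision.
  auto unit = getTimestampUnit(config, "parquet.writer.timestamp-unit");
  // unit == TimestampPrecision::kMicroseconds

  // A value of "0" (seconds) was previously accepted but now fails the
  // VELOX_CHECK with "Invalid timestamp unit: 0".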
3 changes: 2 additions & 1 deletion velox/connectors/hive/tests/HiveConnectorUtilTest.cpp
@@ -424,7 +424,8 @@ TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) {
   auto parquetOptions =
       std::dynamic_pointer_cast<parquet::WriterOptions>(options);
   ASSERT_EQ(
-      parquetOptions->parquetWriteTimestampUnit.value(), TimestampUnit::kMilli);
+      parquetOptions->parquetWriteTimestampUnit.value(),
+      TimestampPrecision::kMilliseconds);
   ASSERT_EQ(parquetOptions->parquetWriteTimestampTimeZone.value(), "UTC");
 }
 #endif
6 changes: 3 additions & 3 deletions velox/docs/configs.rst
@@ -558,7 +558,7 @@ Each query can override the config by setting corresponding query session proper
      - tinyint
      - 9
      - Timestamp unit used when writing timestamps into Parquet through Arrow bridge.
-       Valid values are 0 (second), 3 (millisecond), 6 (microsecond), 9 (nanosecond).
+       Valid values are 3 (millisecond), 6 (microsecond), and 9 (nanosecond).
    * - hive.orc.writer.linear-stripe-size-heuristics
      - orc_writer_linear_stripe_size_heuristics
      - bool
@@ -709,7 +709,7 @@ These semantics are similar to the `Apache Hadoop-Aws module <https://hadoop.apa
    * - fs.azure.account.auth.type.<storage-account>.dfs.core.windows.net
      - string
      - SharedKey
-     - Specifies the authentication mechanism to use for Azure storage accounts.
+     - Specifies the authentication mechanism to use for Azure storage accounts.
        **Allowed values:** "SharedKey", "OAuth", "SAS".
        "SharedKey": Uses the storage account name and key for authentication.
        "OAuth": Utilizes OAuth tokens for secure authentication.
@@ -739,7 +739,7 @@ These semantics are similar to the `Apache Hadoop-Aws module <https://hadoop.apa
    * - fs.azure.account.oauth2.client.endpoint.<storage-account>.dfs.core.windows.net
      - string
      -
-     - Specifies the OAuth 2.0 token endpoint URL for the Azure AD application.
+     - Specifies the OAuth 2.0 token endpoint URL for the Azure AD application.
        This endpoint is used to acquire access tokens for authenticating with Azure storage.
        The URL follows the format: `https://login.microsoftonline.com/<tenant-id>/oauth2/token`.

4 changes: 2 additions & 2 deletions velox/dwio/parquet/tests/reader/E2EFilterTest.cpp
@@ -258,7 +258,7 @@ TEST_F(E2EFilterTest, integerDictionary) {
       20);
 }
 
-TEST_F(E2EFilterTest, timestampDirect) {
+TEST_F(E2EFilterTest, timestampInt96Direct) {
   options_.enableDictionary = false;
   options_.dataPageSize = 4 * 1024;
   options_.writeInt96AsTimestamp = true;
@@ -272,7 +272,7 @@ TEST_F(E2EFilterTest, timestampDirect) {
       20);
 }
 
-TEST_F(E2EFilterTest, timestampDictionary) {
+TEST_F(E2EFilterTest, timestampInt96Dictionary) {
   options_.dataPageSize = 4 * 1024;
   options_.writeInt96AsTimestamp = true;
 
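These two tests exercise the legacy INT96 timestamp encoding (a 12-byte value holding nanoseconds-of-day plus a Julian day), so the rename to timestampInt96* makes their scope explicit. A minimal sketch of the writer options they toggle, using only the fields visible in this diff:

  // Sketch: both tests write INT96 timestamps; they differ only in encoding.
  parquet::WriterOptions options;
  options.writeInt96AsTimestamp = true; // store timestamps as legacy INT96
  options.dataPageSize = 4 * 1024;      // small pages to exercise page boundaries

  options.enableDictionary = false;     // timestampInt96Direct
  // options.enableDictionary left true; // timestampInt96Dictionary (default path)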
35 changes: 24 additions & 11 deletions velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
@@ -99,7 +99,13 @@ class ParquetTableScanTest : public HiveConnectorTestBase {
             rowType, subfieldFilters, remainingFilter, nullptr, assignments)
             .planNode();
 
-    assertQuery(plan, splits_, sql);
+    AssertQueryBuilder(plan, duckDbQueryRunner_)
+        .connectorSessionProperty(
+            kHiveConnectorId,
+            HiveConfig::kReadTimestampUnitSession,
+            std::to_string(static_cast<int>(timestampPrecision_)))
+        .splits(splits_)
+        .assertResults(sql);
   }
 
   void assertSelectWithAgg(
@@ -211,6 +217,10 @@ class ParquetTableScanTest : public HiveConnectorTestBase {
         rootPool_->addAggregateChild("ParquetTableScanTest.Writer");
     options.memoryPool = childPool.get();
 
+    if (options.parquetWriteTimestampUnit.has_value()) {
+      timestampPrecision_ = options.parquetWriteTimestampUnit.value();
+    }
+
     auto writer = std::make_unique<Writer>(
         std::move(sink), options, asRowType(data[0]->type()));
 
@@ -220,7 +230,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase {
     writer->close();
   }
 
-  void testInt96TimestampRead(const WriterOptions& options) {
+  void testTimestampRead(const WriterOptions& options) {
     auto stringToTimestamp = [](std::string_view view) {
       return util::fromTimestampString(
                  view.data(),
@@ -231,16 +241,16 @@ class ParquetTableScanTest : public HiveConnectorTestBase {
           });
    };
    std::vector<std::string_view> views = {
-        "2015-06-01 19:34:56",
-        "2015-06-02 19:34:56",
-        "2001-02-03 03:34:06",
-        "1998-03-01 08:01:06",
+        "2015-06-01 19:34:56.007",
+        "2015-06-02 19:34:56.12306",
+        "2001-02-03 03:34:06.056",
+        "1998-03-01 08:01:06.996669",
         "2022-12-23 03:56:01",
         "1980-01-24 00:23:07",
-        "1999-12-08 13:39:26",
-        "2023-04-21 09:09:34",
+        "1999-12-08 13:39:26.123456",
+        "2023-04-21 09:09:34.5",
         "2000-09-12 22:36:29",
-        "2007-12-12 04:27:56",
+        "2007-12-12 04:27:56.999",
     };
     std::vector<Timestamp> values;
     values.reserve(views.size());
@@ -298,6 +308,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase {
 
   RowTypePtr rowType_;
   std::vector<std::shared_ptr<connector::ConnectorSplit>> splits_;
+  TimestampPrecision timestampPrecision_ = TimestampPrecision::kMicroseconds;
 };
 
 TEST_F(ParquetTableScanTest, basic) {
@@ -820,14 +831,16 @@ TEST_F(ParquetTableScanTest, timestampInt96Dictionary) {
   WriterOptions options;
   options.writeInt96AsTimestamp = true;
   options.enableDictionary = true;
-  testInt96TimestampRead(options);
+  options.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
+  testTimestampRead(options);
 }
 
 TEST_F(ParquetTableScanTest, timestampInt96Plain) {
   WriterOptions options;
   options.writeInt96AsTimestamp = true;
   options.enableDictionary = false;
-  testInt96TimestampRead(options);
+  options.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
+  testTimestampRead(options);
 }
 
 TEST_F(ParquetTableScanTest, timestampPrecisionMicrosecond) {
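Taken together, the hunks above make the scan-side precision follow the writer: the write helper records the unit the test data was written with, and the assertion helper forwards it to the reader as a connector session property. A condensed sketch of that flow, assembled from the lines in this diff (fixture boilerplate elided):

  // New member, defaulting to microseconds.
  TimestampPrecision timestampPrecision_ = TimestampPrecision::kMicroseconds;

  // When writing test data, remember the requested unit.
  if (options.parquetWriteTimestampUnit.has_value()) {
    timestampPrecision_ = options.parquetWriteTimestampUnit.value();
  }

  // When scanning, read back at the same precision.
  AssertQueryBuilder(plan, duckDbQueryRunner_)
      .connectorSessionProperty(
          kHiveConnectorId,
          HiveConfig::kReadTimestampUnitSession,
          std::to_string(static_cast<int>(timestampPrecision_)))
      .splits(splits_)
      .assertResults(sql);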
6 changes: 3 additions & 3 deletions velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp
@@ -162,7 +162,7 @@ DEBUG_ONLY_TEST_F(ParquetWriterTest, unitFromWriterOptions) {
       10'000, [](auto row) { return Timestamp(row, row); })});
   parquet::WriterOptions writerOptions;
   writerOptions.memoryPool = leafPool_.get();
-  writerOptions.parquetWriteTimestampUnit = TimestampUnit::kMicro;
+  writerOptions.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
   writerOptions.parquetWriteTimestampTimeZone = "America/Los_Angeles";
 
   // Create an in-memory writer.
@@ -191,7 +191,7 @@ TEST_F(ParquetWriterTest, parquetWriteTimestampTimeZoneWithDefault) {
       10'000, [](auto row) { return Timestamp(row, row); })});
   parquet::WriterOptions writerOptions;
   writerOptions.memoryPool = leafPool_.get();
-  writerOptions.parquetWriteTimestampUnit = TimestampUnit::kMicro;
+  writerOptions.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
 
   // Create an in-memory writer.
   auto sink = std::make_unique<MemorySink>(
@@ -220,7 +220,7 @@ DEBUG_ONLY_TEST_F(ParquetWriterTest, unitFromHiveConfig) {
   const auto outputDirectory = TempDirectoryPath::create();
 
   auto writerOptions = std::make_shared<parquet::WriterOptions>();
-  writerOptions->parquetWriteTimestampUnit = TimestampUnit::kMicro;
+  writerOptions->parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
 
   const auto plan = PlanBuilder()
                         .values({data})
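For reference, a short usage sketch of the updated option as these tests now set it; only fields appearing in this diff are used, and when parquetWriteTimestampUnit is left unset the writer falls back to nanoseconds (per Writer.h below):

  parquet::WriterOptions writerOptions;
  writerOptions.memoryPool = leafPool_.get(); // any leaf memory pool
  // Write timestamps at microsecond rather than the default nanosecond
  // precision.
  writerOptions.parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
  // Optional: time zone used for timestamp handling on write.
  writerOptions.parquetWriteTimestampTimeZone = "America/Los_Angeles";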
3 changes: 2 additions & 1 deletion velox/dwio/parquet/writer/Writer.cpp
@@ -239,7 +239,8 @@ Writer::Writer(
     flushPolicy_ = std::make_unique<DefaultFlushPolicy>();
   }
   options_.timestampUnit =
-      options.parquetWriteTimestampUnit.value_or(TimestampUnit::kNano);
+      static_cast<TimestampUnit>(options.parquetWriteTimestampUnit.value_or(
+          TimestampPrecision::kNanoseconds));
   options_.timestampTimeZone = options.parquetWriteTimestampTimeZone;
   arrowContext_->properties =
       getArrowParquetWriterOptions(options, flushPolicy_);
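The static_cast above is only meaningful because the two enums share the same numeric encoding (3, 6, 9). A small sketch of how that assumption could be documented locally; this is not part of the commit:

  // Assumed invariant: TimestampPrecision and TimestampUnit agree numerically,
  // so casting between them preserves the unit. These compile only if so.
  static_assert(
      static_cast<int>(TimestampPrecision::kMilliseconds) ==
      static_cast<int>(TimestampUnit::kMilli));
  static_assert(
      static_cast<int>(TimestampPrecision::kMicroseconds) ==
      static_cast<int>(TimestampUnit::kMicro));
  static_assert(
      static_cast<int>(TimestampPrecision::kNanoseconds) ==
      static_cast<int>(TimestampUnit::kNano));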
4 changes: 2 additions & 2 deletions velox/dwio/parquet/writer/Writer.h
@@ -103,8 +103,8 @@ struct WriterOptions : public dwio::common::WriterOptions {
       columnCompressionsMap;
 
   /// Timestamp unit for Parquet write through Arrow bridge.
-  /// Default if not specified: TimestampUnit::kNano (9).
-  std::optional<TimestampUnit> parquetWriteTimestampUnit;
+  /// Default if not specified: TimestampPrecision::kNanoseconds (9).
+  std::optional<TimestampPrecision> parquetWriteTimestampUnit;
   /// Timestamp time zone for Parquet write through Arrow bridge.
   std::optional<std::string> parquetWriteTimestampTimeZone;
   bool writeInt96AsTimestamp = false;
2 changes: 1 addition & 1 deletion velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp
@@ -51,7 +51,7 @@ void writeToFile(
   options->memoryPool = pool;
   // Spark does not recognize int64-timestamp written as nano precision in
   // Parquet.
-  options->parquetWriteTimestampUnit = TimestampUnit::kMicro;
+  options->parquetWriteTimestampUnit = TimestampPrecision::kMicroseconds;
 
   auto writeFile = std::make_unique<LocalWriteFile>(path, true, false);
   auto sink =
