From 8cc7aa60d9b8f896c83022d390d23b6f7eaacd54 Mon Sep 17 00:00:00 2001 From: rexan Date: Wed, 6 Nov 2024 22:16:05 -0500 Subject: [PATCH] Add Spark timestamp to int --- velox/expression/CastExpr-inl.h | 14 ++++++ velox/expression/CastHooks.h | 2 + velox/expression/PrestoCastHooks.cpp | 9 +++- velox/expression/PrestoCastHooks.h | 2 + .../sparksql/specialforms/SparkCastHooks.cpp | 11 +++-- .../sparksql/specialforms/SparkCastHooks.h | 2 + .../sparksql/tests/SparkCastExprTest.cpp | 46 +++++++++++++++++++ velox/type/Timestamp.h | 24 ++++++---- 8 files changed, 96 insertions(+), 14 deletions(-) diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h index f660b992fca7..9b782dc5714e 100644 --- a/velox/expression/CastExpr-inl.h +++ b/velox/expression/CastExpr-inl.h @@ -286,6 +286,20 @@ void CastExpr::applyCastKernel( return; } + if constexpr ( + (ToKind == TypeKind::TINYINT || ToKind == TypeKind::SMALLINT || + ToKind == TypeKind::INTEGER || ToKind == TypeKind::BIGINT) && + FromKind == TypeKind::TIMESTAMP) { + using To = typename TypeTraits::NativeType; + const auto castResult = hooks_->castTimestampToInt(inputRowValue); + if (castResult.hasError()) { + setError(castResult.error().message()); + } else { + result->set(row, static_cast(castResult.value())); + } + return; + } + // Optimize empty input strings casting by avoiding throwing exceptions. if constexpr ( FromKind == TypeKind::VARCHAR || FromKind == TypeKind::VARBINARY) { diff --git a/velox/expression/CastHooks.h b/velox/expression/CastHooks.h index 85231846e346..e892f49dbb38 100644 --- a/velox/expression/CastHooks.h +++ b/velox/expression/CastHooks.h @@ -37,6 +37,8 @@ class CastHooks { virtual Expected castIntToTimestamp(int64_t seconds) const = 0; + virtual Expected castTimestampToInt(Timestamp timestamp) const = 0; + virtual Expected castStringToDate( const StringView& dateString) const = 0; diff --git a/velox/expression/PrestoCastHooks.cpp b/velox/expression/PrestoCastHooks.cpp index 9a2b64e5a70a..3c3250051ba0 100644 --- a/velox/expression/PrestoCastHooks.cpp +++ b/velox/expression/PrestoCastHooks.cpp @@ -67,11 +67,18 @@ Expected PrestoCastHooks::castStringToTimestamp( return result.first; } -Expected PrestoCastHooks::castIntToTimestamp(int64_t seconds) const { +Expected PrestoCastHooks::castIntToTimestamp( + int64_t /*seconds*/) const { return folly::makeUnexpected( Status::UserError("Conversion to Timestamp is not supported")); } +Expected PrestoCastHooks::castTimestampToInt( + Timestamp /*timestamp*/) const { + return folly::makeUnexpected( + Status::UserError("Conversion from Timestamp is not supported")); +} + Expected PrestoCastHooks::castStringToDate( const StringView& dateString) const { // Cast from string to date allows only complete ISO 8601 formatted strings: diff --git a/velox/expression/PrestoCastHooks.h b/velox/expression/PrestoCastHooks.h index a0a1fce7491f..3ee520bea197 100644 --- a/velox/expression/PrestoCastHooks.h +++ b/velox/expression/PrestoCastHooks.h @@ -32,6 +32,8 @@ class PrestoCastHooks : public CastHooks { Expected castIntToTimestamp(int64_t seconds) const override; + Expected castTimestampToInt(Timestamp timestamp) const override; + // Uses standard cast mode to cast from string to date. Expected castStringToDate( const StringView& dateString) const override; diff --git a/velox/functions/sparksql/specialforms/SparkCastHooks.cpp b/velox/functions/sparksql/specialforms/SparkCastHooks.cpp index fcab616e1b07..747fa9b633f3 100644 --- a/velox/functions/sparksql/specialforms/SparkCastHooks.cpp +++ b/velox/functions/sparksql/specialforms/SparkCastHooks.cpp @@ -29,9 +29,8 @@ Expected SparkCastHooks::castStringToTimestamp( Expected SparkCastHooks::castIntToTimestamp(int64_t seconds) const { // Spark internally use microsecond precision for timestamp. // To avoid overflow, we need to check the range of seconds. - static constexpr int64_t maxSeconds = std::numeric_limits::max() / - (Timestamp::kMicrosecondsInMillisecond * - Timestamp::kMillisecondsInSecond); + static constexpr int64_t maxSeconds = + std::numeric_limits::max() / Timestamp::kMicrosecondsInSecond; if (seconds > maxSeconds) { return Timestamp::fromMicrosNoError(std::numeric_limits::max()); } @@ -41,6 +40,12 @@ Expected SparkCastHooks::castIntToTimestamp(int64_t seconds) const { return Timestamp(seconds, 0); } +Expected SparkCastHooks::castTimestampToInt( + Timestamp timestamp) const { + return std::floor( + timestamp.toMicros() / (Timestamp::kMicrosecondsInSecond * 1.0)); +} + Expected SparkCastHooks::castStringToDate( const StringView& dateString) const { // Allows all patterns supported by Spark: diff --git a/velox/functions/sparksql/specialforms/SparkCastHooks.h b/velox/functions/sparksql/specialforms/SparkCastHooks.h index c7d298a0ba4e..8e0e6d5c48a3 100644 --- a/velox/functions/sparksql/specialforms/SparkCastHooks.h +++ b/velox/functions/sparksql/specialforms/SparkCastHooks.h @@ -31,6 +31,8 @@ class SparkCastHooks : public exec::CastHooks { /// number of seconds since the epoch (1970-01-01 00:00:00 UTC). Expected castIntToTimestamp(int64_t seconds) const override; + Expected castTimestampToInt(Timestamp timestamp) const override; + /// 1) Removes all leading and trailing UTF8 white-spaces before cast. 2) Uses /// non-standard cast mode to cast from string to date. Expected castStringToDate( diff --git a/velox/functions/sparksql/tests/SparkCastExprTest.cpp b/velox/functions/sparksql/tests/SparkCastExprTest.cpp index 800710d053b8..304c143995a2 100644 --- a/velox/functions/sparksql/tests/SparkCastExprTest.cpp +++ b/velox/functions/sparksql/tests/SparkCastExprTest.cpp @@ -109,6 +109,24 @@ class SparkCastExprTest : public functions::test::CastBaseTest { Timestamp(std::numeric_limits::min(), 0), std::nullopt})); } + + template + void testTimestampToIntegralCast() { + testCast( + makeNullableFlatVector( + {Timestamp(0, 0), + Timestamp(1, 0), + Timestamp(std::numeric_limits::max(), 0), + Timestamp(std::numeric_limits::min(), 0), + std::nullopt}), + makeNullableFlatVector({ + 0, + 1, + std::numeric_limits::max(), + std::numeric_limits::min(), + std::nullopt, + })); + } }; TEST_F(SparkCastExprTest, date) { @@ -291,6 +309,34 @@ TEST_F(SparkCastExprTest, intToTimestamp) { testIntegralToTimestampCast(); } +TEST_F(SparkCastExprTest, timestampToInt) { + // Cast timestamp as bigint. + testCast( + makeNullableFlatVector({ + Timestamp(0, 0), + Timestamp(1727181032, 0), + Timestamp(-1727181032, 0), + Timestamp(9223372036854, 775'807'000), + Timestamp(-9223372036855, 224'192'000), + }), + makeNullableFlatVector({ + 0, + 1727181032, + -1727181032, + 9223372036854, + -9223372036855, + })); + testInvalidCast( + "bigint", + {Timestamp(9223372036856, 0)}, + "Could not convert Timestamp(9223372036856, 0) to microseconds"); + + // Cast timestamp as tinyint/smallint/integer. + testTimestampToIntegralCast(); + testTimestampToIntegralCast(); + testTimestampToIntegralCast(); +} + TEST_F(SparkCastExprTest, primitiveInvalidCornerCases) { // To integer. { diff --git a/velox/type/Timestamp.h b/velox/type/Timestamp.h index 88a31e72f62e..7cf6704b3705 100644 --- a/velox/type/Timestamp.h +++ b/velox/type/Timestamp.h @@ -81,6 +81,8 @@ struct Timestamp { public: static constexpr int64_t kMillisecondsInSecond = 1'000; static constexpr int64_t kMicrosecondsInMillisecond = 1'000; + static constexpr int64_t kMicrosecondsInSecond = + kMicrosecondsInMillisecond * kMillisecondsInSecond; static constexpr int64_t kNanosecondsInMicrosecond = 1'000; static constexpr int64_t kNanosecondsInMillisecond = 1'000'000; static constexpr int64_t kNanosInSecond = @@ -183,19 +185,21 @@ struct Timestamp { // Keep it in header for getting inlined. int64_t toMicros() const { - // When an integer overflow occurs in the calculation, - // an exception will be thrown. - try { - return checkedPlus( - checkedMultiply(seconds_, (int64_t)1'000'000), - (int64_t)(nanos_ / 1'000)); - } catch (const std::exception& e) { + // We use int128_t to make sure the computation does not overflows since + // there are cases such that seconds*1000000 does not fit in int64_t, + // but seconds*1000000 + nanos does, an example is TimeStamp::minMillis(). + + // If the final result does not fit in int64_tw we throw. + __int128_t result = + (__int128_t)seconds_ * 1'000'000 + (int64_t)(nanos_ / 1'000); + if (result < std::numeric_limits::min() || + result > std::numeric_limits::max()) { VELOX_USER_FAIL( - "Could not convert Timestamp({}, {}) to microseconds, {}", + "Could not convert Timestamp({}, {}) to microseconds", seconds_, - nanos_, - e.what()); + nanos_); } + return result; } /// Exports the current timestamp as a std::chrono::time_point of millisecond