diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 5355a073b114..c1cd9082c355 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -24,6 +24,7 @@ #include "velox/connectors/hive/iceberg/IcebergSplitReader.h" #include "velox/dwio/common/CachedBufferedInput.h" #include "velox/dwio/common/ReaderFactory.h" +#include "velox/type/TimestampConversion.h" namespace facebook::velox::connector::hive { namespace { @@ -39,8 +40,8 @@ VectorPtr newConstantFromString( } if (type->isDate()) { - auto copy = - util::castFromDateString(StringView(value.value()), true /*isIso8601*/); + auto copy = util::castFromDateString( + StringView(value.value()), util::ParseMode::kStandardCast); return std::make_shared>( pool, size, false, type, std::move(copy)); } diff --git a/velox/docs/functions/presto/datetime.rst b/velox/docs/functions/presto/datetime.rst index abf1377a2fdc..349eb58ec214 100644 --- a/velox/docs/functions/presto/datetime.rst +++ b/velox/docs/functions/presto/datetime.rst @@ -95,6 +95,25 @@ Date and Time Functions .. function:: from_iso8601_date(string) -> date Parses the ISO 8601 formatted ``string`` into a ``date``. + ISO 8601 ``string`` can be formatted as any of the following: + ``[+-][Y]Y*`` + + ``[+-][Y]Y*-[M]M*`` + + ``[+-][Y]Y*-[M]M*-[D]D*`` + + ``[+-][Y]Y*-[M]M*-[D]D* *`` + + Year value must contain at least one digit, and may contain up to six digits. + Month and day values are optional and may each contain one or two digits. + + Examples of supported input strings: + "2012", + "2012-4", + "2012-04", + "2012-4-7", + "2012-04-07", + "2012-04-07 ” .. function:: from_unixtime(unixtime) -> timestamp diff --git a/velox/expression/PrestoCastHooks.cpp b/velox/expression/PrestoCastHooks.cpp index 1fa866650c80..37d1357cf623 100644 --- a/velox/expression/PrestoCastHooks.cpp +++ b/velox/expression/PrestoCastHooks.cpp @@ -16,6 +16,7 @@ #include "velox/expression/PrestoCastHooks.h" #include "velox/external/date/tz.h" +#include "velox/type/TimestampConversion.h" namespace facebook::velox::exec { @@ -36,9 +37,9 @@ Timestamp PrestoCastHooks::castStringToTimestamp(const StringView& view) const { } int32_t PrestoCastHooks::castStringToDate(const StringView& dateString) const { - // Cast from string to date allows only ISO 8601 formatted strings: + // Cast from string to date allows only complete ISO 8601 formatted strings: // [+-](YYYY-MM-DD). - return util::castFromDateString(dateString, true /*isIso8601*/); + return util::castFromDateString(dateString, util::ParseMode::kStandardCast); } bool PrestoCastHooks::legacy() const { diff --git a/velox/functions/prestosql/DateTimeFunctions.h b/velox/functions/prestosql/DateTimeFunctions.h index a41f16ee10d7..3e0c329c931e 100644 --- a/velox/functions/prestosql/DateTimeFunctions.h +++ b/velox/functions/prestosql/DateTimeFunctions.h @@ -1271,7 +1271,8 @@ struct FromIso8601Date { FOLLY_ALWAYS_INLINE void call( out_type& result, const arg_type& input) { - result = util::fromDateString(input.data(), input.size()); + result = util::castFromDateString( + input.data(), input.size(), util::ParseMode::kNonStandardNoTimeCast); } }; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index ed473efe6c75..3068d36b6e31 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -3409,8 +3409,21 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Date) { EXPECT_EQ(0, fromIso("1970-01-01")); EXPECT_EQ(9, fromIso("1970-01-10")); EXPECT_EQ(-1, fromIso("1969-12-31")); + EXPECT_EQ(0, fromIso("1970")); + EXPECT_EQ(0, fromIso("1970-01")); + EXPECT_EQ(0, fromIso("1970-1")); + EXPECT_EQ(8, fromIso("1970-1-9")); + EXPECT_EQ(-31, fromIso("1969-12")); + EXPECT_EQ(-31, fromIso("1969-12-1")); + EXPECT_EQ(-31, fromIso("1969-12-01")); + EXPECT_EQ(-31, fromIso(" 1969-12-01 ")); + EXPECT_EQ(-719862, fromIso("-1-2-1")); VELOX_ASSERT_THROW(fromIso("2024-01-xx"), "Unable to parse date value"); + VELOX_ASSERT_THROW( + fromIso("2024-01-02T12:31:00"), "Unable to parse date value"); + VELOX_ASSERT_THROW( + fromIso("2024-01-02 12:31:00"), "Unable to parse date value"); } TEST_F(DateTimeFunctionsTest, dateParse) { diff --git a/velox/functions/sparksql/specialforms/SparkCastHooks.cpp b/velox/functions/sparksql/specialforms/SparkCastHooks.cpp index ab7f37201e64..6a1b7c5ea326 100644 --- a/velox/functions/sparksql/specialforms/SparkCastHooks.cpp +++ b/velox/functions/sparksql/specialforms/SparkCastHooks.cpp @@ -16,6 +16,7 @@ #include "velox/functions/sparksql/specialforms/SparkCastHooks.h" #include "velox/functions/lib/string/StringImpl.h" +#include "velox/type/TimestampConversion.h" namespace facebook::velox::functions::sparksql { @@ -36,7 +37,7 @@ int32_t SparkCastHooks::castStringToDate(const StringView& dateString) const { // "1970-01-01 123" // "1970-01-01 (BC)" return util::castFromDateString( - removeWhiteSpaces(dateString), false /*isIso8601*/); + removeWhiteSpaces(dateString), util::ParseMode::kNonStandardCast); } bool SparkCastHooks::legacy() const { diff --git a/velox/type/TimestampConversion.cpp b/velox/type/TimestampConversion.cpp index e3ab6885e609..93e65e1779c8 100644 --- a/velox/type/TimestampConversion.cpp +++ b/velox/type/TimestampConversion.cpp @@ -105,20 +105,6 @@ constexpr int32_t kCumulativeYearDays[] = { namespace { -// Enum to dictate parsing modes for date strings. -// -// kStrict: For date string conversion, align with DuckDB's implementation. -// -// kNonStrict: For timestamp string conversion, align with DuckDB's -// implementation. -// -// kStandardCast: Strictly processes dates in the [+-](YYYY-MM-DD) format. -// Align with Presto casting conventions. -// -// kNonStandardCast: Like standard but permits missing day/month and allows -// trailing 'T' or spaces. Align with Spark SQL casting conventions. -enum class ParseMode { kStrict, kNonStrict, kStandardCast, kNonStandardCast }; - inline bool characterIsSpace(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; @@ -217,7 +203,9 @@ bool tryParseDateString( } // No month or day. - if (mode == ParseMode::kNonStandardCast && pos == len) { + if ((mode == ParseMode::kNonStandardCast || + mode == ParseMode::kNonStandardNoTimeCast) && + pos == len) { if (!daysSinceEpochFromDate(year, 1, 1, daysSinceEpoch).ok()) { return false; } @@ -230,7 +218,8 @@ bool tryParseDateString( // Fetch the separator. sep = buf[pos++]; - if (mode == ParseMode::kStandardCast || mode == ParseMode::kNonStandardCast) { + if (mode == ParseMode::kStandardCast || mode == ParseMode::kNonStandardCast || + mode == ParseMode::kNonStandardNoTimeCast) { // Only '-' is valid for cast. if (sep != '-') { return false; @@ -248,7 +237,9 @@ bool tryParseDateString( } // No day. - if (mode == ParseMode::kNonStandardCast && pos == len) { + if ((mode == ParseMode::kNonStandardCast || + mode == ParseMode::kNonStandardNoTimeCast) && + pos == len) { if (!daysSinceEpochFromDate(year, month, 1, daysSinceEpoch).ok()) { return false; } @@ -319,7 +310,7 @@ bool tryParseDateString( } // In strict mode, check remaining string for non-space characters. - if (mode == ParseMode::kStrict) { + if (mode == ParseMode::kStrict || mode == ParseMode::kNonStandardNoTimeCast) { // Skip trailing spaces. while (pos < len && characterIsSpace(buf[pos])) { pos++; @@ -605,27 +596,36 @@ int64_t fromDateString(const char* str, size_t len) { return daysSinceEpoch; } -int32_t castFromDateString(const char* str, size_t len, bool isIso8601) { +int32_t castFromDateString(const char* str, size_t len, ParseMode mode) { int64_t daysSinceEpoch; size_t pos = 0; - auto mode = - isIso8601 ? ParseMode::kStandardCast : ParseMode::kNonStandardCast; if (!tryParseDateString(str, len, pos, daysSinceEpoch, mode)) { - if (isIso8601) { - VELOX_USER_FAIL( - "Unable to parse date value: \"{}\"." - "Valid date string pattern is (YYYY-MM-DD), " - "and can be prefixed with [+-]", - std::string(str, len)); - } else { - VELOX_USER_FAIL( - "Unable to parse date value: \"{}\"." - "Valid date string patterns include " - "(yyyy*, yyyy*-[m]m, yyyy*-[m]m-[d]d, " - "yyyy*-[m]m-[d]d *, yyyy*-[m]m-[d]dT*), " - "and any pattern prefixed with [+-]", - std::string(str, len)); + switch (mode) { + case ParseMode::kStandardCast: + VELOX_USER_FAIL( + "Unable to parse date value: \"{}\". " + "Valid date string pattern is (YYYY-MM-DD), " + "and can be prefixed with [+-]", + std::string(str, len)); + case ParseMode::kNonStandardCast: + VELOX_USER_FAIL( + "Unable to parse date value: \"{}\". " + "Valid date string patterns include " + "([y]y*, [y]y*-[m]m*, [y]y*-[m]m*-[d]d*, " + "[y]y*-[m]m*-[d]d* *, [y]y*-[m]m*-[d]d*T*), " + "and any pattern prefixed with [+-]", + std::string(str, len)); + case ParseMode::kNonStandardNoTimeCast: + VELOX_USER_FAIL( + "Unable to parse date value: \"{}\". " + "Valid date string patterns include " + "([y]y*, [y]y*-[m]m*, [y]y*-[m]m*-[d]d*, " + "[y]y*-[m]m*-[d]d* *), " + "and any pattern prefixed with [+-]", + std::string(str, len)); + default: + VELOX_UNREACHABLE(); } } return daysSinceEpoch; diff --git a/velox/type/TimestampConversion.h b/velox/type/TimestampConversion.h index 9bae1aa632ea..c6bee1f3fb69 100644 --- a/velox/type/TimestampConversion.h +++ b/velox/type/TimestampConversion.h @@ -44,6 +44,39 @@ constexpr const int32_t kMaxYear{292278994}; constexpr const int32_t kYearInterval{400}; constexpr const int32_t kDaysPerYearInterval{146097}; +/// Enum to dictate parsing modes for date strings. +enum class ParseMode { + // For date string conversion, align with DuckDB's implementation. + kStrict, + + // For timestamp string conversion, align with DuckDB's implementation. + kNonStrict, + + // Strictly processes dates only in complete ISO 8601 format, + // e.g. [+-](YYYY-MM-DD). + // Align with Presto casting conventions. + kStandardCast, + + // Like kStandardCast but permits years less than four digits, missing + // day/month, and allows trailing 'T' or spaces. + // Align with Spark SQL casting conventions. + // Supported formats: + // `[+-][Y]Y*` + // `[+-][Y]Y*-[M]M` + // `[+-][Y]Y*-[M]M*-[D]D` + // `[+-][Y]Y*-[M]M*-[D]D *` + // `[+-][Y]Y*-[M]M*-[D]DT*` + kNonStandardCast, + + // Like kNonStandardCast but does not permit inclusion of timestamp. + // Supported formats: + // `[+-][Y]Y*` + // `[+-][Y]Y*-[M]M` + // `[+-][Y]Y*-[M]M*-[D]D` + // `[+-][Y]Y*-[M]M*-[D]D *` + kNonStandardNoTimeCast +}; + // Returns true if leap year, false otherwise bool isLeapYear(int32_t year); @@ -91,22 +124,14 @@ inline int64_t fromDateString(const StringView& str) { return fromDateString(str.data(), str.size()); } -/// Cast string to date. -/// When isIso8601 = true, only support "[+-]YYYY-MM-DD" format (ISO 8601). -/// When isIso8601 = false, supported date formats include: -/// -/// `[+-]YYYY*` -/// `[+-]YYYY*-[M]M` -/// `[+-]YYYY*-[M]M-[D]D` -/// `[+-]YYYY*-[M]M-[D]D ` -/// `[+-]YYYY*-[M]M-[D]D *` -/// `[+-]YYYY*-[M]M-[D]DT*` +/// Cast string to date. Supported date formats vary, depending on input +/// ParseMode. Refer to ParseMode enum for further info. /// /// Throws VeloxUserError if the format or date is invalid. -int32_t castFromDateString(const char* buf, size_t len, bool isIso8601); +int32_t castFromDateString(const char* buf, size_t len, ParseMode mode); -inline int32_t castFromDateString(const StringView& str, bool isIso8601) { - return castFromDateString(str.data(), str.size(), isIso8601); +inline int32_t castFromDateString(const StringView& str, ParseMode mode) { + return castFromDateString(str.data(), str.size(), mode); } // Extracts the day of the week from the number of days since epoch diff --git a/velox/type/tests/TimestampConversionTest.cpp b/velox/type/tests/TimestampConversionTest.cpp index 459d702640e3..2264299a65f1 100644 --- a/velox/type/tests/TimestampConversionTest.cpp +++ b/velox/type/tests/TimestampConversionTest.cpp @@ -132,76 +132,99 @@ TEST(DateTimeUtilTest, fromDateStrInvalid) { } TEST(DateTimeUtilTest, castFromDateString) { - for (bool isIso8601 : {true, false}) { - EXPECT_EQ(0, castFromDateString("1970-01-01", isIso8601)); - EXPECT_EQ(3789742, castFromDateString("12345-12-18", isIso8601)); + for (ParseMode mode : + {ParseMode::kStandardCast, ParseMode::kNonStandardCast}) { + EXPECT_EQ(0, castFromDateString("1970-01-01", mode)); + EXPECT_EQ(3789742, castFromDateString("12345-12-18", mode)); - EXPECT_EQ(1, castFromDateString("1970-1-2", isIso8601)); - EXPECT_EQ(1, castFromDateString("1970-01-2", isIso8601)); - EXPECT_EQ(1, castFromDateString("1970-1-02", isIso8601)); + EXPECT_EQ(1, castFromDateString("1970-1-2", mode)); + EXPECT_EQ(1, castFromDateString("1970-01-2", mode)); + EXPECT_EQ(1, castFromDateString("1970-1-02", mode)); - EXPECT_EQ(1, castFromDateString("+1970-01-02", isIso8601)); - EXPECT_EQ(-719893, castFromDateString("-1-1-1", isIso8601)); + EXPECT_EQ(1, castFromDateString("+1970-01-02", mode)); + EXPECT_EQ(-719893, castFromDateString("-1-1-1", mode)); - EXPECT_EQ(0, castFromDateString(" 1970-01-01", isIso8601)); + EXPECT_EQ(0, castFromDateString(" 1970-01-01", mode)); } - EXPECT_EQ(3789391, castFromDateString("12345", false)); - EXPECT_EQ(16436, castFromDateString("2015", false)); - EXPECT_EQ(16495, castFromDateString("2015-03", false)); - EXPECT_EQ(16512, castFromDateString("2015-03-18T", false)); - EXPECT_EQ(16512, castFromDateString("2015-03-18T123123", false)); - EXPECT_EQ(16512, castFromDateString("2015-03-18 123142", false)); - EXPECT_EQ(16512, castFromDateString("2015-03-18 (BC)", false)); + EXPECT_EQ(3789391, castFromDateString("12345", ParseMode::kNonStandardCast)); + EXPECT_EQ(16436, castFromDateString("2015", ParseMode::kNonStandardCast)); + EXPECT_EQ(16495, castFromDateString("2015-03", ParseMode::kNonStandardCast)); + EXPECT_EQ( + 16512, castFromDateString("2015-03-18T", ParseMode::kNonStandardCast)); + EXPECT_EQ( + 16512, + castFromDateString("2015-03-18T123123", ParseMode::kNonStandardCast)); + EXPECT_EQ( + 16512, + castFromDateString("2015-03-18 123142", ParseMode::kNonStandardCast)); + EXPECT_EQ( + 16512, + castFromDateString("2015-03-18 (BC)", ParseMode::kNonStandardCast)); - EXPECT_EQ(0, castFromDateString("1970-01-01 ", false)); - EXPECT_EQ(0, castFromDateString(" 1970-01-01 ", false)); + EXPECT_EQ(0, castFromDateString("1970-01-01 ", ParseMode::kNonStandardCast)); + EXPECT_EQ(0, castFromDateString(" 1970-01-01 ", ParseMode::kNonStandardCast)); } TEST(DateTimeUtilTest, castFromDateStringInvalid) { auto testCastFromDateStringInvalid = [&](const StringView& str, - bool isIso8601) { - if (isIso8601) { + ParseMode mode) { + if (mode == ParseMode::kStandardCast) { VELOX_ASSERT_THROW( - castFromDateString(str, isIso8601), + castFromDateString(str, mode), fmt::format( - "Unable to parse date value: \"{}\"." + "Unable to parse date value: \"{}\". " "Valid date string pattern is (YYYY-MM-DD), " "and can be prefixed with [+-]", std::string(str.data(), str.size()))); - } else { + } else if (mode == ParseMode::kNonStandardCast) { + VELOX_ASSERT_THROW( + castFromDateString(str, mode), + fmt::format( + "Unable to parse date value: \"{}\". " + "Valid date string patterns include " + "([y]y*, [y]y*-[m]m*, [y]y*-[m]m*-[d]d*, " + "[y]y*-[m]m*-[d]d* *, [y]y*-[m]m*-[d]d*T*), " + "and any pattern prefixed with [+-]", + std::string(str.data(), str.size()))); + } else if (mode == ParseMode::kNonStandardNoTimeCast) { VELOX_ASSERT_THROW( - castFromDateString(str, isIso8601), + castFromDateString(str, mode), fmt::format( - "Unable to parse date value: \"{}\"." + "Unable to parse date value: \"{}\". " "Valid date string patterns include " - "(yyyy*, yyyy*-[m]m, yyyy*-[m]m-[d]d, " - "yyyy*-[m]m-[d]d *, yyyy*-[m]m-[d]dT*), " + "([y]y*, [y]y*-[m]m*, [y]y*-[m]m*-[d]d*, " + "[y]y*-[m]m*-[d]d* *), " "and any pattern prefixed with [+-]", std::string(str.data(), str.size()))); } }; - for (bool isIso8601 : {true, false}) { - testCastFromDateStringInvalid("2012-Oct-23", isIso8601); - testCastFromDateStringInvalid("2012-Oct-23", isIso8601); - testCastFromDateStringInvalid("2015-03-18X", isIso8601); - testCastFromDateStringInvalid("2015/03/18", isIso8601); - testCastFromDateStringInvalid("2015.03.18", isIso8601); - testCastFromDateStringInvalid("20150318", isIso8601); - testCastFromDateStringInvalid("2015-031-8", isIso8601); + for (ParseMode mode : + {ParseMode::kStandardCast, ParseMode::kNonStandardCast}) { + testCastFromDateStringInvalid("2012-Oct-23", mode); + testCastFromDateStringInvalid("2012-Oct-23", mode); + testCastFromDateStringInvalid("2015-03-18X", mode); + testCastFromDateStringInvalid("2015/03/18", mode); + testCastFromDateStringInvalid("2015.03.18", mode); + testCastFromDateStringInvalid("20150318", mode); + testCastFromDateStringInvalid("2015-031-8", mode); } - testCastFromDateStringInvalid("12345", true); - testCastFromDateStringInvalid("2015", true); - testCastFromDateStringInvalid("2015-03", true); - testCastFromDateStringInvalid("2015-03-18 123412", true); - testCastFromDateStringInvalid("2015-03-18T", true); - testCastFromDateStringInvalid("2015-03-18T123412", true); - testCastFromDateStringInvalid("2015-03-18 (BC)", true); - - testCastFromDateStringInvalid("1970-01-01 ", true); - testCastFromDateStringInvalid(" 1970-01-01 ", true); + testCastFromDateStringInvalid("12345", ParseMode::kStrict); + testCastFromDateStringInvalid("2015", ParseMode::kStrict); + testCastFromDateStringInvalid("2015-03", ParseMode::kStrict); + testCastFromDateStringInvalid("2015-03-18 123412", ParseMode::kStrict); + testCastFromDateStringInvalid("2015-03-18T", ParseMode::kStrict); + testCastFromDateStringInvalid("2015-03-18T123412", ParseMode::kStrict); + testCastFromDateStringInvalid("2015-03-18 (BC)", ParseMode::kStrict); + testCastFromDateStringInvalid("1970-01-01 ", ParseMode::kStrict); + testCastFromDateStringInvalid(" 1970-01-01 ", ParseMode::kStrict); + + testCastFromDateStringInvalid( + "1970-01-01T01:00:47", ParseMode::kNonStandardNoTimeCast); + testCastFromDateStringInvalid( + "1970-01-01T01:00:47.000", ParseMode::kNonStandardNoTimeCast); } TEST(DateTimeUtilTest, fromTimeString) {