Skip to content

Commit

Permalink
Enable partial date input support for from_iso8601_date() (facebookin…
Browse files Browse the repository at this point in the history
…cubator#9357)

Summary:
Also refactors `castFromDateString()` to take a `ParseMode` parameter instead of a boolean, improving flexibility and ease of use.

Pull Request resolved: facebookincubator#9357

Reviewed By: xiaoxmeng

Differential Revision: D56042071

Pulled By: mbasmanova

fbshipit-source-id: 61cb5ef5d72862c62a435a7b13585d7f6013edb0
  • Loading branch information
svm1 authored and facebook-github-bot committed Apr 13, 2024
1 parent 7a36070 commit 115a240
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 99 deletions.
5 changes: 3 additions & 2 deletions velox/connectors/hive/SplitReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "velox/connectors/hive/iceberg/IcebergSplitReader.h"
#include "velox/dwio/common/CachedBufferedInput.h"
#include "velox/dwio/common/ReaderFactory.h"
#include "velox/type/TimestampConversion.h"

namespace facebook::velox::connector::hive {
namespace {
Expand All @@ -39,8 +40,8 @@ VectorPtr newConstantFromString(
}

if (type->isDate()) {
auto copy =
util::castFromDateString(StringView(value.value()), true /*isIso8601*/);
auto copy = util::castFromDateString(
StringView(value.value()), util::ParseMode::kStandardCast);
return std::make_shared<ConstantVector<int32_t>>(
pool, size, false, type, std::move(copy));
}
Expand Down
19 changes: 19 additions & 0 deletions velox/docs/functions/presto/datetime.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,25 @@ Date and Time Functions
.. function:: from_iso8601_date(string) -> date

Parses the ISO 8601 formatted ``string`` into a ``date``.
ISO 8601 ``string`` can be formatted as any of the following:
``[+-][Y]Y*``

``[+-][Y]Y*-[M]M``

``[+-][Y]Y*-[M]M-[D]D``

``[+-][Y]Y*-[M]M-[D]D *``

Year value must contain at least one digit, and may contain up to six digits.
Month and day values are optional and may each contain one or two digits.

Examples of supported input strings:
"2012",
"2012-4",
"2012-04",
"2012-4-7",
"2012-04-07",
"2012-04-07 "

.. function:: from_unixtime(unixtime) -> timestamp

Expand Down
5 changes: 3 additions & 2 deletions velox/expression/PrestoCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "velox/expression/PrestoCastHooks.h"
#include "velox/external/date/tz.h"
#include "velox/type/TimestampConversion.h"

namespace facebook::velox::exec {

Expand All @@ -36,9 +37,9 @@ Timestamp PrestoCastHooks::castStringToTimestamp(const StringView& view) const {
}

int32_t PrestoCastHooks::castStringToDate(const StringView& dateString) const {
// Cast from string to date allows only ISO 8601 formatted strings:
// Cast from string to date allows only complete ISO 8601 formatted strings:
// [+-](YYYY-MM-DD).
return util::castFromDateString(dateString, true /*isIso8601*/);
return util::castFromDateString(dateString, util::ParseMode::kStandardCast);
}

bool PrestoCastHooks::legacy() const {
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -1271,7 +1271,8 @@ struct FromIso8601Date {
FOLLY_ALWAYS_INLINE void call(
out_type<Date>& result,
const arg_type<Varchar>& input) {
result = util::fromDateString(input.data(), input.size());
result = util::castFromDateString(
input.data(), input.size(), util::ParseMode::kNonStandardNoTimeCast);
}
};

Expand Down
13 changes: 13 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3409,8 +3409,21 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Date) {
EXPECT_EQ(0, fromIso("1970-01-01"));
EXPECT_EQ(9, fromIso("1970-01-10"));
EXPECT_EQ(-1, fromIso("1969-12-31"));
EXPECT_EQ(0, fromIso("1970"));
EXPECT_EQ(0, fromIso("1970-01"));
EXPECT_EQ(0, fromIso("1970-1"));
EXPECT_EQ(8, fromIso("1970-1-9"));
EXPECT_EQ(-31, fromIso("1969-12"));
EXPECT_EQ(-31, fromIso("1969-12-1"));
EXPECT_EQ(-31, fromIso("1969-12-01"));
EXPECT_EQ(-31, fromIso(" 1969-12-01 "));
EXPECT_EQ(-719862, fromIso("-1-2-1"));

VELOX_ASSERT_THROW(fromIso("2024-01-xx"), "Unable to parse date value");
VELOX_ASSERT_THROW(
fromIso("2024-01-02T12:31:00"), "Unable to parse date value");
VELOX_ASSERT_THROW(
fromIso("2024-01-02 12:31:00"), "Unable to parse date value");
}

TEST_F(DateTimeFunctionsTest, dateParse) {
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/sparksql/specialforms/SparkCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "velox/functions/sparksql/specialforms/SparkCastHooks.h"
#include "velox/functions/lib/string/StringImpl.h"
#include "velox/type/TimestampConversion.h"

namespace facebook::velox::functions::sparksql {

Expand All @@ -36,7 +37,7 @@ int32_t SparkCastHooks::castStringToDate(const StringView& dateString) const {
// "1970-01-01 123"
// "1970-01-01 (BC)"
return util::castFromDateString(
removeWhiteSpaces(dateString), false /*isIso8601*/);
removeWhiteSpaces(dateString), util::ParseMode::kNonStandardCast);
}

bool SparkCastHooks::legacy() const {
Expand Down
70 changes: 35 additions & 35 deletions velox/type/TimestampConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,6 @@ constexpr int32_t kCumulativeYearDays[] = {

namespace {

// Enum to dictate parsing modes for date strings.
//
// kStrict: For date string conversion, align with DuckDB's implementation.
//
// kNonStrict: For timestamp string conversion, align with DuckDB's
// implementation.
//
// kStandardCast: Strictly processes dates in the [+-](YYYY-MM-DD) format.
// Align with Presto casting conventions.
//
// kNonStandardCast: Like standard but permits missing day/month and allows
// trailing 'T' or spaces. Align with Spark SQL casting conventions.
enum class ParseMode { kStrict, kNonStrict, kStandardCast, kNonStandardCast };

inline bool characterIsSpace(char c) {
return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
c == '\r';
Expand Down Expand Up @@ -217,7 +203,9 @@ bool tryParseDateString(
}

// No month or day.
if (mode == ParseMode::kNonStandardCast && pos == len) {
if ((mode == ParseMode::kNonStandardCast ||
mode == ParseMode::kNonStandardNoTimeCast) &&
pos == len) {
if (!daysSinceEpochFromDate(year, 1, 1, daysSinceEpoch).ok()) {
return false;
}
Expand All @@ -230,7 +218,8 @@ bool tryParseDateString(

// Fetch the separator.
sep = buf[pos++];
if (mode == ParseMode::kStandardCast || mode == ParseMode::kNonStandardCast) {
if (mode == ParseMode::kStandardCast || mode == ParseMode::kNonStandardCast ||
mode == ParseMode::kNonStandardNoTimeCast) {
// Only '-' is valid for cast.
if (sep != '-') {
return false;
Expand All @@ -248,7 +237,9 @@ bool tryParseDateString(
}

// No day.
if (mode == ParseMode::kNonStandardCast && pos == len) {
if ((mode == ParseMode::kNonStandardCast ||
mode == ParseMode::kNonStandardNoTimeCast) &&
pos == len) {
if (!daysSinceEpochFromDate(year, month, 1, daysSinceEpoch).ok()) {
return false;
}
Expand Down Expand Up @@ -319,7 +310,7 @@ bool tryParseDateString(
}

// In kStrict and kNonStandardNoTimeCast modes, check remaining string for
// non-space characters.
if (mode == ParseMode::kStrict) {
if (mode == ParseMode::kStrict || mode == ParseMode::kNonStandardNoTimeCast) {
// Skip trailing spaces.
while (pos < len && characterIsSpace(buf[pos])) {
pos++;
Expand Down Expand Up @@ -605,27 +596,36 @@ int64_t fromDateString(const char* str, size_t len) {
return daysSinceEpoch;
}

int32_t castFromDateString(const char* str, size_t len, bool isIso8601) {
int32_t castFromDateString(const char* str, size_t len, ParseMode mode) {
int64_t daysSinceEpoch;
size_t pos = 0;

auto mode =
isIso8601 ? ParseMode::kStandardCast : ParseMode::kNonStandardCast;
if (!tryParseDateString(str, len, pos, daysSinceEpoch, mode)) {
if (isIso8601) {
VELOX_USER_FAIL(
"Unable to parse date value: \"{}\"."
"Valid date string pattern is (YYYY-MM-DD), "
"and can be prefixed with [+-]",
std::string(str, len));
} else {
VELOX_USER_FAIL(
"Unable to parse date value: \"{}\"."
"Valid date string patterns include "
"(yyyy*, yyyy*-[m]m, yyyy*-[m]m-[d]d, "
"yyyy*-[m]m-[d]d *, yyyy*-[m]m-[d]dT*), "
"and any pattern prefixed with [+-]",
std::string(str, len));
switch (mode) {
case ParseMode::kStandardCast:
VELOX_USER_FAIL(
"Unable to parse date value: \"{}\". "
"Valid date string pattern is (YYYY-MM-DD), "
"and can be prefixed with [+-]",
std::string(str, len));
case ParseMode::kNonStandardCast:
VELOX_USER_FAIL(
"Unable to parse date value: \"{}\". "
"Valid date string patterns include "
"([y]y*, [y]y*-[m]m*, [y]y*-[m]m*-[d]d*, "
"[y]y*-[m]m*-[d]d* *, [y]y*-[m]m*-[d]d*T*), "
"and any pattern prefixed with [+-]",
std::string(str, len));
case ParseMode::kNonStandardNoTimeCast:
VELOX_USER_FAIL(
"Unable to parse date value: \"{}\". "
"Valid date string patterns include "
"([y]y*, [y]y*-[m]m*, [y]y*-[m]m*-[d]d*, "
"[y]y*-[m]m*-[d]d* *), "
"and any pattern prefixed with [+-]",
std::string(str, len));
default:
VELOX_UNREACHABLE();
}
}
return daysSinceEpoch;
Expand Down
51 changes: 38 additions & 13 deletions velox/type/TimestampConversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,39 @@ constexpr const int32_t kMaxYear{292278994};
constexpr const int32_t kYearInterval{400};
constexpr const int32_t kDaysPerYearInterval{146097};

/// Enum to dictate parsing modes for date strings. The mode selects which
/// input formats are accepted when converting a string to a date.
///
/// In the format patterns below, `[X]` is an optional digit and `Y*` stands
/// for additional year digits.
enum class ParseMode {
// For date string conversion, align with DuckDB's implementation.
kStrict,

// For timestamp string conversion, align with DuckDB's implementation.
kNonStrict,

// Strictly processes dates only in complete ISO 8601 format,
// e.g. [+-](YYYY-MM-DD).
// Align with Presto casting conventions.
// Used for Presto CAST(varchar AS date) and for Hive partition-key date
// values.
kStandardCast,

// Like kStandardCast but permits years with fewer than four digits, a
// missing day or month, and trailing content introduced by 'T' or a space.
// Align with Spark SQL casting conventions.
// Supported formats (a trailing `*` after ' ' or 'T' means arbitrary
// text):
// `[+-][Y]Y*`
// `[+-][Y]Y*-[M]M`
// `[+-][Y]Y*-[M]M-[D]D`
// `[+-][Y]Y*-[M]M-[D]D *`
// `[+-][Y]Y*-[M]M-[D]DT*`
kNonStandardCast,

// Like kNonStandardCast but does not permit inclusion of a timestamp: a
// time component introduced by 'T' or a space is rejected.
// Used by the Presto from_iso8601_date() function.
// Supported formats (here the trailing ` *` means whitespace only):
// `[+-][Y]Y*`
// `[+-][Y]Y*-[M]M`
// `[+-][Y]Y*-[M]M-[D]D`
// `[+-][Y]Y*-[M]M-[D]D *`
kNonStandardNoTimeCast
};

// Returns true if leap year, false otherwise
bool isLeapYear(int32_t year);

Expand Down Expand Up @@ -91,22 +124,14 @@ inline int64_t fromDateString(const StringView& str) {
return fromDateString(str.data(), str.size());
}

/// Cast string to date.
/// When isIso8601 = true, only support "[+-]YYYY-MM-DD" format (ISO 8601).
/// When isIso8601 = false, supported date formats include:
///
/// `[+-]YYYY*`
/// `[+-]YYYY*-[M]M`
/// `[+-]YYYY*-[M]M-[D]D`
/// `[+-]YYYY*-[M]M-[D]D `
/// `[+-]YYYY*-[M]M-[D]D *`
/// `[+-]YYYY*-[M]M-[D]DT*`
/// Cast string to date. Supported date formats vary, depending on input
/// ParseMode. Refer to ParseMode enum for further info.
///
/// Throws VeloxUserError if the format or date is invalid.
int32_t castFromDateString(const char* buf, size_t len, bool isIso8601);
int32_t castFromDateString(const char* buf, size_t len, ParseMode mode);

inline int32_t castFromDateString(const StringView& str, bool isIso8601) {
return castFromDateString(str.data(), str.size(), isIso8601);
inline int32_t castFromDateString(const StringView& str, ParseMode mode) {
return castFromDateString(str.data(), str.size(), mode);
}

// Extracts the day of the week from the number of days since epoch
Expand Down
Loading

0 comments on commit 115a240

Please sign in to comment.