Init weekyear support of SimpleDateFormat
ccat3z committed Oct 16, 2024
1 parent 4210a9a commit 174b4bc
Showing 5 changed files with 164 additions and 50 deletions.
32 changes: 32 additions & 0 deletions velox/core/QueryConfig.h
@@ -297,6 +297,22 @@ class QueryConfig {
static constexpr const char* kSparkLegacyDateFormatter =
"spark.legacy_date_formatter";

/// The first day-of-week varies by culture.
/// firstDayOfWeek is a 1-based weekday number starting with Sunday. It
/// determines how the week-based calendar works. For example, ISO-8601 uses
/// Monday (2) and the US uses Sunday (1). It should be set to match
/// 'Calendar.getFirstDayOfWeek()' in Java. Sunday (1) is used by default.
static constexpr const char* kSparkFirstDayOfWeek =
"spark.legacy_date_formatter.first_day_of_week";

/// The minimal number of days in the first week varies by culture.
/// The week that includes January 1st and has 'minimalDaysInFirstWeek' or
/// more days is referred to as week 1. It determines how the week-based
/// calendar works. It should be set to match
/// 'Calendar.getMinimalDaysInFirstWeek()' in Java. 1 day is used by default.
static constexpr const char* kSparkMinimalDaysInFirstWeek =
"spark.legacy_date_formatter.minimal_days_in_first_week";

/// The number of local parallel table writer operators per task.
static constexpr const char* kTaskWriterCount = "task_writer_count";

@@ -759,6 +775,22 @@ class QueryConfig {
return get<bool>(kSparkLegacyDateFormatter, false);
}

uint8_t sparkFirstDayOfWeek() const {
auto value = get<uint32_t>(kSparkFirstDayOfWeek, 1);
VELOX_CHECK(
1 <= value && value <= 7,
"firstDayOfWeek must be a number between 1 and 7");
return static_cast<uint8_t>(value);
}

uint8_t sparkMinimalDaysInFirstWeek() const {
auto value = get<uint32_t>(kSparkMinimalDaysInFirstWeek, 1);
VELOX_CHECK(
1 <= value && value <= 7,
"minimalDaysInFirstWeek must be a number between 1 and 7");
return static_cast<uint8_t>(value);
}

bool exprTrackCpuUsage() const {
return get<bool>(kExprTrackCpuUsage, false);
}
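To make the semantics of the two new settings concrete, the following standalone sketch (not part of this commit) computes a week-based year under the same rules using C++20 <chrono>. The helpers week1Start and weekYear are illustrative names only, not Velox APIs.

#include <chrono>
#include <iostream>

using namespace std::chrono;

// First day of "week 1" of calendar year y. Weeks begin on firstDayOfWeek
// (1 = Sunday .. 7 = Saturday, as in java.util.Calendar) and week 1 is the
// first week containing at least minimalDaysInFirstWeek days of year y.
sys_days week1Start(int y, int firstDayOfWeek, int minimalDaysInFirstWeek) {
  const sys_days jan1{year{y} / January / 1};
  // Day of week of Jan 1 in the Java encoding: 1 = Sunday .. 7 = Saturday.
  const int dowJan1 = static_cast<int>(weekday{jan1}.c_encoding()) + 1;
  // Distance from the start of Jan 1's week back to Jan 1 itself.
  const int offset = (dowJan1 - firstDayOfWeek + 7) % 7;
  sys_days start = jan1 - days{offset};
  // If that week holds too few days of year y, week 1 is the following week.
  if (7 - offset < minimalDaysInFirstWeek) {
    start += days{7};
  }
  return start;
}

int weekYear(year_month_day d, int firstDayOfWeek, int minimalDaysInFirstWeek) {
  const int y = static_cast<int>(d.year());
  const sys_days date{d};
  if (date >= week1Start(y + 1, firstDayOfWeek, minimalDaysInFirstWeek)) {
    return y + 1;  // Late December may already belong to the next week-year.
  }
  if (date < week1Start(y, firstDayOfWeek, minimalDaysInFirstWeek)) {
    return y - 1;  // Early January may still belong to the previous week-year.
  }
  return y;
}

int main() {
  std::cout << weekYear(2017y / January / 1, 2, 4) << "\n";  // ISO-8601 (Monday, 4 days): 2016
  std::cout << weekYear(2017y / January / 1, 1, 1) << "\n";  // Legacy defaults (Sunday, 1 day): 2017
}

Under ISO-8601 settings 2017-01-01 belongs to week-year 2016, while under the default legacy settings (Sunday, 1 day) it belongs to 2017, matching the new cases in DateTimeFunctionsTest.cpp below.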
10 changes: 3 additions & 7 deletions velox/functions/lib/DateTimeFormatter.cpp
@@ -1334,9 +1334,8 @@ int32_t DateTimeFormatter::format(
static_cast<int>(calDate.year()),
static_cast<uint32_t>(calDate.month()),
static_cast<uint32_t>(calDate.day()),
2, // (ISO 8601) Monday = 2
4 // At least 4 days in first week
);
firstDayOfWeek_,
minimalDaysInFirstWeek_);

result += padContent(
static_cast<signed>(year),
@@ -1839,14 +1838,11 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
case 'w':
builder.appendWeekOfWeekYear(count);
break;
case 'x':
builder.appendWeekYear(count);
break;
case 'y':
builder.appendYear(count);
break;
case 'Y':
builder.appendYearOfEra(count);
builder.appendWeekYear(count);
break;
case 'z':
builder.appendTimeZone(count);
20 changes: 20 additions & 0 deletions velox/functions/lib/DateTimeFormatter.h
@@ -208,11 +208,31 @@ class DateTimeFormatter {
char* result,
bool allowOverflow = false) const;

void setFirstDayOfWeek(uint8_t firstDayOfWeek) {
firstDayOfWeek_ = firstDayOfWeek;
}

void setMinimalDaysInFirstWeek(uint8_t minimalDaysInFirstWeek) {
minimalDaysInFirstWeek_ = minimalDaysInFirstWeek;
}

private:
std::unique_ptr<char[]> literalBuf_;
size_t bufSize_;
std::vector<DateTimeToken> tokens_;
DateTimeFormatterType type_;

/// The first day-of-week varies by culture.
/// firstDayOfWeek is a 1-based weekday number starting with Sunday. It
/// determines how the week-based calendar works. For example, ISO-8601 uses
/// Monday (2) and the US uses Sunday (1).
uint8_t firstDayOfWeek_ = 2;

/// The minimal number of days in the first week varies by culture.
/// The week that includes January 1st and has 'minimalDaysInFirstWeek' or
/// more days is referred to as week 1. It determines how the week-based
/// calendar works. For example, ISO-8601 uses 4 days.
uint8_t minimalDaysInFirstWeek_ = 4;
};

Expected<std::shared_ptr<DateTimeFormatter>> buildMysqlDateTimeFormatter(
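For reference, a minimal usage sketch of the two new setters on a formatter built for the legacy SimpleDateFormat path, mirroring what the DateTimeFormatterProvider introduced in DateTimeFunctions.h below does; the "YYYY-ww" pattern is just an example.

auto formatterOr = buildSimpleDateTimeFormatter("YYYY-ww", /*lenient=*/false);
if (!formatterOr.hasError()) {
  auto formatter = formatterOr.value();
  // ISO-8601 behaviour: weeks start on Monday and week 1 needs at least 4 days.
  formatter->setFirstDayOfWeek(2);
  formatter->setMinimalDaysInFirstWeek(4);
}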
87 changes: 44 additions & 43 deletions velox/functions/sparksql/DateTimeFunctions.h
@@ -26,19 +26,32 @@
namespace facebook::velox::functions::sparksql {

namespace detail {
Expected<std::shared_ptr<DateTimeFormatter>> getDateTimeFormatter(
const std::string_view& format,
DateTimeFormatterType type) {
switch (type) {
case DateTimeFormatterType::STRICT_SIMPLE:
return buildSimpleDateTimeFormatter(format, /*lenient=*/false);
case DateTimeFormatterType::LENIENT_SIMPLE:
return buildSimpleDateTimeFormatter(format, /*lenient=*/true);
default:
struct DateTimeFormatterProvider {
FOLLY_ALWAYS_INLINE void init(const core::QueryConfig& config) {
legacyFormatter_ = config.sparkLegacyDateFormatter();
firstDayOfWeek_ = config.sparkFirstDayOfWeek();
minimalDaysInFirstWeek_ = config.sparkMinimalDaysInFirstWeek();
}

FOLLY_ALWAYS_INLINE auto get(const std::string_view& format) {
if (legacyFormatter_) {
return buildSimpleDateTimeFormatter(format, /*lenient=*/false)
.then([this](std::shared_ptr<DateTimeFormatter> f) {
f->setFirstDayOfWeek(firstDayOfWeek_);
f->setMinimalDaysInFirstWeek(minimalDaysInFirstWeek_);
return f;
});
} else {
return buildJodaDateTimeFormatter(
std::string_view(format.data(), format.size()));
}
}
}

std::shared_ptr<DateTimeFormatter> formatter_;
bool legacyFormatter_{false};
uint8_t firstDayOfWeek_ = 2;
uint8_t minimalDaysInFirstWeek_ = 4;
};
} // namespace detail

template <typename T>
@@ -127,10 +140,8 @@ struct UnixTimestampParseFunction {
const std::vector<TypePtr>& /*inputTypes*/,
const core::QueryConfig& config,
const arg_type<Varchar>* /*input*/) {
auto formatter = detail::getDateTimeFormatter(
kDefaultFormat_,
config.sparkLegacyDateFormatter() ? DateTimeFormatterType::STRICT_SIMPLE
: DateTimeFormatterType::JODA);
formatter_provider_.init(config);
auto formatter = formatter_provider_.get(kDefaultFormat_);
VELOX_CHECK(!formatter.hasError(), "Default format should always be valid");
format_ = formatter.value();
setTimezone(config);
Expand Down Expand Up @@ -166,6 +177,7 @@ struct UnixTimestampParseFunction {
// Default if format is not specified, as per Spark documentation.
constexpr static std::string_view kDefaultFormat_{"yyyy-MM-dd HH:mm:ss"};
std::shared_ptr<DateTimeFormatter> format_;
detail::DateTimeFormatterProvider formatter_provider_;
const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // fallback to GMT.
};

@@ -181,12 +193,10 @@ struct UnixTimestampParseWithFormatFunction
const core::QueryConfig& config,
const arg_type<Varchar>* /*input*/,
const arg_type<Varchar>* format) {
legacyFormatter_ = config.sparkLegacyDateFormatter();
this->formatter_provider_.init(config);
if (format != nullptr) {
auto formatter = detail::getDateTimeFormatter(
std::string_view(format->data(), format->size()),
legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
: DateTimeFormatterType::JODA);
auto formatter = this->formatter_provider_.get(
std::string_view(format->data(), format->size()));
if (formatter.hasError()) {
invalidFormat_ = true;
} else {
@@ -207,10 +217,8 @@

// Format error returns null.
if (!isConstFormat_) {
auto formatter = detail::getDateTimeFormatter(
std::string_view(format.data(), format.size()),
legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
: DateTimeFormatterType::JODA);
auto formatter = this->formatter_provider_.get(
std::string_view(format.data(), format.size()));
if (formatter.hasError()) {
return false;
}
@@ -230,7 +238,6 @@
private:
bool isConstFormat_{false};
bool invalidFormat_{false};
bool legacyFormatter_{false};
};

// Parses unix time in seconds to a formatted string.
@@ -243,7 +250,7 @@ struct FromUnixtimeFunction {
const core::QueryConfig& config,
const arg_type<int64_t>* /*unixtime*/,
const arg_type<Varchar>* format) {
legacyFormatter_ = config.sparkLegacyDateFormatter();
formatter_provider_.init(config);
sessionTimeZone_ = getTimeZoneFromConfig(config);
if (format != nullptr) {
setFormatter(*format);
@@ -268,21 +275,19 @@

private:
FOLLY_ALWAYS_INLINE void setFormatter(const arg_type<Varchar>& format) {
formatter_ = detail::getDateTimeFormatter(
std::string_view(format.data(), format.size()),
legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
: DateTimeFormatterType::JODA)
.thenOrThrow(folly::identity, [&](const Status& status) {
VELOX_USER_FAIL("{}", status.message());
});
formatter_ =
formatter_provider_.get(std::string_view(format.data(), format.size()))
.thenOrThrow(folly::identity, [&](const Status& status) {
VELOX_USER_FAIL("{}", status.message());
});
maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_);
}

const tz::TimeZone* sessionTimeZone_{nullptr};
std::shared_ptr<DateTimeFormatter> formatter_;
uint32_t maxResultSize_;
bool isConstantTimeFormat_{false};
bool legacyFormatter_{false};
detail::DateTimeFormatterProvider formatter_provider_;
};

template <typename T>
@@ -356,16 +361,14 @@ struct GetTimestampFunction {
const core::QueryConfig& config,
const arg_type<Varchar>* /*input*/,
const arg_type<Varchar>* format) {
legacyFormatter_ = config.sparkLegacyDateFormatter();
formatter_provider_.init(config);
auto sessionTimezoneName = config.sessionTimezone();
if (!sessionTimezoneName.empty()) {
sessionTimeZone_ = tz::locateZone(sessionTimezoneName);
}
if (format != nullptr) {
formatter_ = detail::getDateTimeFormatter(
std::string_view(*format),
legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
: DateTimeFormatterType::JODA)
formatter_ = formatter_provider_
.get(std::string_view(format->data(), format->size()))
.thenOrThrow(folly::identity, [&](const Status& status) {
VELOX_USER_FAIL("{}", status.message());
});
@@ -378,10 +381,8 @@
const arg_type<Varchar>& input,
const arg_type<Varchar>& format) {
if (!isConstantTimeFormat_) {
formatter_ = detail::getDateTimeFormatter(
std::string_view(format),
legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE
: DateTimeFormatterType::JODA)
formatter_ = formatter_provider_
.get(std::string_view(format.data(), format.size()))
.thenOrThrow(folly::identity, [&](const Status& status) {
VELOX_USER_FAIL("{}", status.message());
});
@@ -407,7 +408,7 @@
std::shared_ptr<DateTimeFormatter> formatter_{nullptr};
bool isConstantTimeFormat_{false};
const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT.
bool legacyFormatter_{false};
detail::DateTimeFormatterProvider formatter_provider_;
};

template <typename T>
65 changes: 65 additions & 0 deletions velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp
@@ -917,6 +917,71 @@ TEST_F(DateTimeFunctionsTest, fromUnixtime) {
fromUnixTime(getUnixTime("2020-06-30 23:59:59"), "yyyy-MM-dd HH:mm:ss"),
"2020-07-01 07:59:59");

// Weekyear cases of ISO-8601 standard.
queryCtx_->testingOverrideConfigUnsafe({
{core::QueryConfig::kSparkLegacyDateFormatter, "true"},
{core::QueryConfig::kSparkFirstDayOfWeek, std::to_string(2)},
{core::QueryConfig::kSparkMinimalDaysInFirstWeek, std::to_string(4)},
});
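// 2017-01-01 is a Sunday. With weeks starting on Monday and week 1 requiring
// at least 4 days, week 1 of 2017 begins on 2017-01-02, so Jan 1 still belongs
// to week-year 2016.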
EXPECT_EQ(fromUnixTime(getUnixTime("2017-01-01 00:00:00"), "YYYY"), "2016");
EXPECT_EQ(fromUnixTime(getUnixTime("2017-12-31 00:00:00"), "YYYY"), "2017");
EXPECT_EQ(fromUnixTime(getUnixTime("2018-01-01 00:00:00"), "YYYY"), "2018");
EXPECT_EQ(fromUnixTime(getUnixTime("2018-12-31 00:00:00"), "YYYY"), "2019");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-01-01 00:00:00"), "YYYY"), "2019");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-30 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-31 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-01-01 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-31 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-01 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-02 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-03 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-31 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2022-01-01 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2022-01-02 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2022-12-31 00:00:00"), "YYYY"), "2022");

// Weekyear cases of spark legacy date formatter with default config.
queryCtx_->testingOverrideConfigUnsafe({
{core::QueryConfig::kSparkLegacyDateFormatter, "true"},
{core::QueryConfig::kSparkFirstDayOfWeek, std::to_string(1)},
{core::QueryConfig::kSparkMinimalDaysInFirstWeek, std::to_string(1)},
});
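// With Sunday as the first day of the week and only one day required in week 1,
// the week containing January 1st is always week 1, so dates from the last
// Sunday of December onward already belong to the next week-year.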
EXPECT_EQ(fromUnixTime(getUnixTime("2017-01-01 00:00:00"), "YYYY"), "2017");
EXPECT_EQ(fromUnixTime(getUnixTime("2017-12-31 00:00:00"), "YYYY"), "2018");
EXPECT_EQ(fromUnixTime(getUnixTime("2018-01-01 00:00:00"), "YYYY"), "2018");
EXPECT_EQ(fromUnixTime(getUnixTime("2018-12-30 00:00:00"), "YYYY"), "2019");
EXPECT_EQ(fromUnixTime(getUnixTime("2018-12-31 00:00:00"), "YYYY"), "2019");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-01-01 00:00:00"), "YYYY"), "2019");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-29 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-30 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2019-12-31 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-01-01 00:00:00"), "YYYY"), "2020");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-27 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-28 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-29 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-30 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2020-12-31 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-01-01 00:00:00"), "YYYY"), "2021");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-26 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-27 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-28 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-29 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-30 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2021-12-31 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2022-01-01 00:00:00"), "YYYY"), "2022");
EXPECT_EQ(fromUnixTime(getUnixTime("2022-12-31 00:00:00"), "YYYY"), "2022");

// Week config should only apply to spark legacy date formatter.
queryCtx_->testingOverrideConfigUnsafe({
{core::QueryConfig::kSparkLegacyDateFormatter, "false"},
{core::QueryConfig::kSparkFirstDayOfWeek, std::to_string(1)},
{core::QueryConfig::kSparkMinimalDaysInFirstWeek, std::to_string(1)},
});
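// With the Joda-based (non-legacy) formatter, 'x' is the weekyear specifier and
// ISO-8601 rules apply regardless of the week config above: 2017-12-31 falls in
// ISO week 52 of 2017.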
EXPECT_EQ(fromUnixTime(getUnixTime("2017-12-31 00:00:00"), "x"), "2017");

// Reset config
queryCtx_->testingOverrideConfigUnsafe({});

// Invalid format.
VELOX_ASSERT_THROW(
fromUnixTime(0, "yyyy-AA"), "Specifier A is not supported.");
