Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support WEEK_YEAR for date time formatter #10930

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions velox/core/QueryConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,22 @@ class QueryConfig {
static constexpr const char* kSparkLegacyDateFormatter =
"spark.legacy_date_formatter";

/// The first day-of-week varies by culture.
/// firstDayOfWeek is a 1-based weekday number starting with Sunday. It
/// determines how week-based calendar works. For example, the ISO-8601 use
/// Monday (2) and the US uses Sunday (1). It should be set to match the
/// 'Calender.getFirstDayOfWeek()' in Java. Sunday (1) is used by default.
static constexpr const char* kSparkFirstDayOfWeek =
"spark.legacy_date_formatter.first_day_of_week";

/// The minimal number of days in the first week by culture.
/// The week that includes January 1st and has 'minimalDaysInFirstWeek' or
/// more days is referred to as week 1. It determines how week-based calendar
/// works. It should be set to match the
/// 'Calender.getMinimalDaysInFirstWeek()' in Java. 1 days is used by default.
static constexpr const char* kSparkMinimalDaysInFirstWeek =
"spark.legacy_date_formatter.minimal_days_in_first_week";

/// The number of local parallel table writer operators per task.
static constexpr const char* kTaskWriterCount = "task_writer_count";

Expand Down Expand Up @@ -742,6 +758,22 @@ class QueryConfig {
return get<bool>(kSparkLegacyDateFormatter, false);
}

uint8_t sparkFirstDayOfWeek() const {
auto value = get<uint32_t>(kSparkFirstDayOfWeek, 1);
VELOX_USER_CHECK(
1 <= value && value <= 7,
"firstDayOfWeek must be a number between 1 and 7");
return static_cast<uint8_t>(value);
}

uint8_t sparkMinimalDaysInFirstWeek() const {
auto value = get<uint32_t>(kSparkMinimalDaysInFirstWeek, 1);
VELOX_USER_CHECK(
1 <= value && value <= 7,
"minimalDaysInFirstWeek must be a number between 1 and 7");
return static_cast<uint8_t>(value);
}

bool exprTrackCpuUsage() const {
return get<bool>(kExprTrackCpuUsage, false);
}
Expand Down
25 changes: 19 additions & 6 deletions velox/functions/lib/DateTimeFormatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "velox/external/date/iso_week.h"
#include "velox/external/date/tz.h"
#include "velox/functions/lib/DateTimeFormatterBuilder.h"
#include "velox/functions/lib/TimeUtils.h"
#include "velox/type/TimestampConversion.h"
#include "velox/type/tz/TimeZoneMap.h"

Expand Down Expand Up @@ -1175,6 +1176,7 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const {
size += 2;
break;
case DateTimeFormatSpecifier::YEAR_OF_ERA:
case DateTimeFormatSpecifier::WEEK_YEAR:
// Timestamp is in [-32767-01-01, 32767-12-31] range.
size += std::max((int)token.pattern.minRepresentDigits, 6);
break;
Expand Down Expand Up @@ -1245,7 +1247,6 @@ uint32_t DateTimeFormatter::maxResultSize(const tz::TimeZone* timezone) const {
}
break;
// Not supported.
case DateTimeFormatSpecifier::WEEK_YEAR:
default:
VELOX_UNSUPPORTED(
"Date format specifier is not supported: {}",
Expand Down Expand Up @@ -1542,7 +1543,22 @@ int32_t DateTimeFormatter::format(
result);
break;
}
case DateTimeFormatSpecifier::WEEK_YEAR:
case DateTimeFormatSpecifier::WEEK_YEAR: {
auto year = getWeekYear(
static_cast<int>(calDate.year()),
static_cast<uint32_t>(calDate.month()),
static_cast<uint32_t>(calDate.day()),
firstDayOfWeek_,
minimalDaysInFirstWeek_);

result += padContent(
static_cast<signed>(year),
'0',
token.pattern.minRepresentDigits,
maxResultEnd,
result);
break;
}
default:
VELOX_UNSUPPORTED(
"format is not supported for specifier {}",
Expand Down Expand Up @@ -2046,14 +2062,11 @@ Expected<std::shared_ptr<DateTimeFormatter>> buildSimpleDateTimeFormatter(
case 'W':
builder.appendWeekOfMonth(count);
break;
case 'x':
builder.appendWeekYear(count);
break;
case 'y':
builder.appendYear(count);
break;
case 'Y':
builder.appendYearOfEra(count);
builder.appendWeekYear(count);
break;
case 'z':
builder.appendTimeZone(count);
Expand Down
20 changes: 20 additions & 0 deletions velox/functions/lib/DateTimeFormatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,31 @@ class DateTimeFormatter {
bool allowOverflow = false,
const std::optional<std::string>& zeroOffsetText = std::nullopt) const;

void setFirstDayOfWeek(uint8_t firstDayOfWeek) {
firstDayOfWeek_ = firstDayOfWeek;
}

void setMinimalDaysInFirstWeek(uint8_t minimalDaysInFirstWeek) {
minimalDaysInFirstWeek_ = minimalDaysInFirstWeek;
}

private:
std::unique_ptr<char[]> literalBuf_;
size_t bufSize_;
std::vector<DateTimeToken> tokens_;
DateTimeFormatterType type_;

/// The first day-of-week varies by culture.
/// firstDayOfWeek is a 1-based weekday number starting with Sunday. It
/// determines how week-based calendar works. For example, the ISO-8601 use
/// Monday (2) and the US uses Sunday (1).
uint8_t firstDayOfWeek_ = 2;

/// The minimal number of days in the first week by culture.
/// The week that includes January 1st and has 'minimalDaysInFirstWeek' or
/// more days is referred to as week 1. It determines how week-based calendar
/// works. For example, the ISO-8601 use 4 days.
uint8_t minimalDaysInFirstWeek_ = 4;
};

Expected<std::shared_ptr<DateTimeFormatter>> buildMysqlDateTimeFormatter(
Expand Down
94 changes: 94 additions & 0 deletions velox/functions/lib/TimeUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "velox/external/date/date.h"
#include "velox/external/date/iso_week.h"
#include "velox/functions/Macros.h"
#include "velox/type/TimestampConversion.h"
#include "velox/type/tz/TimeZoneMap.h"

namespace facebook::velox::functions {
Expand Down Expand Up @@ -123,4 +124,97 @@ struct InitSessionTimezone {
timeZone_ = getTimeZoneFromConfig(config);
}
};

/// Return day-of-year (DOY) of the first `dayOfWeek` in the year.
///
/// `dayOfWeek` is a 1-based weekday number starting with Sunday.
/// (1 = Sunday, 2 = Monday, ..., 7 = Saturday).
rui-mo marked this conversation as resolved.
Show resolved Hide resolved
///
/// If the `dayOfWeek` is Monday, it returns DOY of first Monday in
/// the year. The returned DOY is a number from 1 to 7.
rui-mo marked this conversation as resolved.
Show resolved Hide resolved
FOLLY_ALWAYS_INLINE
uint32_t getDayOfFirstDayOfWeek(int32_t y, uint32_t dayOfWeek) {
auto firstDay =
date::year_month_day(date::year(y), date::month(1), date::day(1));
auto weekday = date::weekday(firstDay).c_encoding() + 1;

int32_t delta = dayOfWeek - weekday;
if (delta < 0) {
delta += 7;
}

return delta + 1;
}

/// Return the week year represented by Gregorian calendar for the given year,
/// month and day.
///
/// getWeekYear only works with Gregorian calendar due to limitations in the
/// date library. As a result, dates before the Gregorian calendar
/// (1582-10-15) yields mismatched results.
///
/// The week that includes January 1st and has 'minimalDaysInFirstWeek' or more
/// days is referred to as week 1. The starting day of the week is decided by
/// the `firstDayOfWeek`, which is a 1-based weekday number starting with
/// Sunday.
///
/// For ISO 8601, `firstDayOfWeek` is 2 (Monday) and `minimalDaysInFirstWeek`
/// is 4. For legacy Spark, `firstDayOfWeek` is 1 (Sunday) and
/// `minimalDaysInFirstWeek` is 1.
///
/// The algorithm refers to the getWeekYear algorithm in openjdk:
/// https://github.com/openjdk/jdk/blob/d9c67443f7d7f03efb2837b63ee2acc6113f737f/src/java.base/share/classes/java/util/GregorianCalendar.java#L2058
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ The upstream code is licensed under GPL 2.0 and while APIs and algorithms don't fall under copyright, afaik the implementation does. So this should probably not be merged without checking with some form of legal professional. @pedroerp @mbasmanova

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ccat3z @rui-mo is this different from the vendored date library we have at:

https://github.com/facebookincubator/velox/blob/main/velox/external/date/iso_week.h

we use it's iso_week functionality in a few places already. If it's the same logic, it would be nice to consolidate the usage.

Copy link
Contributor Author

@ccat3z ccat3z Oct 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

iso_week only support ISO 8601 standard week year (iso_week.h#L1519). The purpose of this implementation is to support non-ISO standard week year, which is required by Spark.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@assignUser @ccat3z @rui-mo I asked for guidance from license experts on this. Give me a day or two until they get back to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pedroerp Any updates?

FOLLY_ALWAYS_INLINE
int32_t getWeekYear(
int32_t y,
uint32_t m,
uint32_t d,
uint32_t firstDayOfWeek,
rui-mo marked this conversation as resolved.
Show resolved Hide resolved
uint32_t minimalDaysInFirstWeek) {
const auto ymd =
date::year_month_day(date::year(y), date::month(m), date::day(d));
const auto firstDayOfTheYear =
date::year_month_day(ymd.year(), date::month(1), date::day(1));
const auto dayOfYear =
(date::sys_days{ymd} - date::sys_days{firstDayOfTheYear}).count() + 1;
const auto maxDayOfYear = util::isLeapYear(y) ? 366 : 365;

// If this week does not cross the years (`7 < dayOfYear && dayOfYear <
// (maxDayOfYear - 6)`), the weekyear must be equal to the year.
//
// If some days of this week fall in the last year and `minimalDaysInFirstWeek
// < dayOfYear`, the number of days in this week in this year must be greater
// than minimalDaysInFirstWeek, so the weekyear must be equal to the year.
//
// Since minimalDaysInFirstWeek always no more than 7, these two conditions
// can be reduced to the following code.
if (dayOfYear > minimalDaysInFirstWeek && dayOfYear < (maxDayOfYear - 6)) {
rui-mo marked this conversation as resolved.
Show resolved Hide resolved
return y;
}

auto year = y;
// Day of beginning of first complete week of this year.
auto minDayOfYear = getDayOfFirstDayOfWeek(y, firstDayOfWeek);
if (dayOfYear >= minDayOfYear) {
// Day of ending of first week of the last year.
auto minDayOfYear = getDayOfFirstDayOfWeek(y + 1, firstDayOfWeek) - 1;
if (minDayOfYear == 0) {
minDayOfYear = 7;
}

// If that week belongs to the next weekyear.
if (minDayOfYear >= minimalDaysInFirstWeek) {
// If dayOfYear is in that week.
int days = maxDayOfYear - dayOfYear + 1;
if (days <= (7 - minDayOfYear)) {
++year;
}
}
} else if (minDayOfYear <= minimalDaysInFirstWeek) {
// Days of the first week in this year less then minimalDaysInFirstWeek
--year;
}

return year;
}
} // namespace facebook::velox::functions
1 change: 1 addition & 0 deletions velox/functions/lib/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ add_executable(
Re2FunctionsTest.cpp
RepeatTest.cpp
Utf8Test.cpp
TimeUtilsTest.cpp
ZetaDistributionTest.cpp)

add_test(
Expand Down
20 changes: 20 additions & 0 deletions velox/functions/lib/tests/DateTimeFormatterTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,26 @@ TEST_F(JodaDateTimeFormatterTest, betterErrorMessaging) {
"Value 429 for dayOfMonth must be in the range [1,365] for year 2057 and month 2.");
}

TEST_F(JodaDateTimeFormatterTest, formatWeekYear) {
DateTimeFormatterBuilder builder(10);
auto formatter =
builder.appendWeekYear(4).setType(DateTimeFormatterType::JODA).build();
auto* timezone = tz::locateZone("GMT");
const auto maxSize = formatter->maxResultSize(timezone);

auto weekYear = [&](StringView time) {
std::string result(maxSize, '\0');
auto resultSize = formatter->format(
fromTimestampString(time), timezone, maxSize, result.data());
result.resize(resultSize);
return result;
};

EXPECT_EQ(weekYear("2019-12-31 00:00:00"), "2020");
EXPECT_EQ(weekYear("2020-12-26 00:00:00"), "2020");
EXPECT_EQ(weekYear("2021-01-01 00:00:00"), "2020");
}

class MysqlDateTimeTest : public DateTimeFormatterTest {};

TEST_F(MysqlDateTimeTest, validBuild) {
Expand Down
Loading
Loading