Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Casting Varchar to Timestamp should handle offsets that are not recognized time zones #11849

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions velox/expression/PrestoCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,38 @@ Expected<Timestamp> PrestoCastHooks::castStringToTimestamp(
// If the parsed string has timezone information, convert the timestamp at
// GMT at that time. For example, "1970-01-01 00:00:00 -00:01" is 60 seconds
// at GMT.
if (result.second != nullptr) {
result.first.toGMT(*result.second);

}
// If no timezone information is available in the input string, check if we
// should understand it as being at the session timezone, and if so, convert
// to GMT.
else if (options_.timeZone != nullptr) {
result.first.toGMT(*options_.timeZone);
if (result.timeZone != nullptr) {
result.timestamp.toGMT(*result.timeZone);
} else {
if (result.offsetMillis.has_value()) {
auto seconds = result.timestamp.getSeconds();
// use int128_t to avoid overflow.
int128_t nanos = result.timestamp.getNanos();

seconds += result.offsetMillis.value() / util::kMillisPerSecond;
nanos += (result.offsetMillis.value() % util::kMillisPerSecond) *
util::kNanosPerMicro * util::kMicrosPerMsec;

if (nanos < 0) {
seconds -= 1;
nanos += Timestamp::kNanosInSecond;
}

if (nanos > Timestamp::kMaxNanos) {
seconds += 1;
nanos -= Timestamp::kNanosInSecond;
}

result.timestamp = Timestamp(seconds, nanos);
}
// If no timezone information is available in the input string, check if we
// should understand it as being at the session timezone, and if so, convert
// to GMT.
if (options_.timeZone != nullptr) {
result.timestamp.toGMT(*options_.timeZone);
}
}
return result.first;
return result.timestamp;
}

Expected<Timestamp> PrestoCastHooks::castIntToTimestamp(int64_t seconds) const {
Expand Down
74 changes: 74 additions & 0 deletions velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -548,8 +548,41 @@ TEST_F(CastExprTest, stringToTimestamp) {
"1970-01-01 00:00:00-02:00",
"1970-01-01 00:00:00 +02",
"1970-01-01 00:00:00 -0101",
// Fully specified offset.
"1970-01-01 00:00:00 +01:01:01.001",
"1970-01-02 00:00:00 -01:01:01.001",
// Offset with two digit milliseconds.
"1970-01-01 00:00:00 +01:01:01.01",
"1970-01-02 00:00:00 -01:01:01.01",
// Offset with one digit milliseconds.
"1970-01-01 00:00:00 +01:01:01.1",
"1970-01-02 00:00:00 -01:01:01.1",
// Offset without milliseconds.
"1970-01-01 00:00:00 +01:01:01",
"1970-01-02 00:00:00 -01:01:01",
// Offset without seconds.
"1970-01-01 00:00:00 +23:01",
"1970-01-02 00:00:00 -23:01",
// Offset without minutes.
"1970-01-01 00:00:00 +23",
"1970-01-02 00:00:00 -23",
// Upper and lower limits of offsets.
"2000-01-01 12:13:14.123+23:59:59.999",
"2000-01-01 12:13:14.123-23:59:59.999",
// Comma instead of period for decimal in offset.
"1970-01-01 00:00:00 +01:01:01,001",
// Trailing spaces after offset.
"1970-01-02 00:00:00 -01:01:01.001 ",
// Overflow of nanoseconds in offset.
"1970-01-01 00:00:00.999 +01:01:01.002",
// Underflow of nanoseconds in offset.
"1970-01-02 00:00:00.001 -01:01:01.002",
// No optional separators
"1970-01-01 00:00:00 +010101001",
std::nullopt,
};

// two spaces, too large
std::vector<std::optional<Timestamp>> expected{
Timestamp(0, 0),
Timestamp(10800, 0),
Expand All @@ -559,6 +592,25 @@ TEST_F(CastExprTest, stringToTimestamp) {
Timestamp(7200, 0),
Timestamp(-7200, 0),
Timestamp(3660, 0),
Timestamp(3661, 1000000),
Timestamp(82738, 999000000),
Timestamp(3661, 10000000),
Timestamp(82738, 990000000),
Timestamp(3661, 100000000),
Timestamp(82738, 900000000),
Timestamp(3661, 0),
Timestamp(82739, 0),
Timestamp(82860, 0),
Timestamp(3540, 0),
Timestamp(82800, 0),
Timestamp(3600, 0),
Timestamp(946815194, 122000000),
Timestamp(946642394, 124000000),
Timestamp(3661, 1000000),
Timestamp(82738, 999000000),
Timestamp(3662, 1000000),
Timestamp(82738, 999000000),
Timestamp(3661, 1000000),
std::nullopt,
};
testCast<std::string, Timestamp>("timestamp", input, expected);
Expand All @@ -571,13 +623,15 @@ TEST_F(CastExprTest, stringToTimestamp) {
"1970-01-01 00:00 +01:00",
"1970-01-01 00:00 America/Sao_Paulo",
"2000-01-01 12:21:56Z",
"2000-01-01 12:21:56+01:01:01",
};
expected = {
Timestamp(28800, 0),
Timestamp(0, 0),
Timestamp(-3600, 0),
Timestamp(10800, 0),
Timestamp(946729316, 0),
Timestamp(946761777, 0),
};
testCast<std::string, Timestamp>("timestamp", input, expected);

Expand All @@ -602,6 +656,26 @@ TEST_F(CastExprTest, stringToTimestamp) {
(evaluateOnce<Timestamp, std::string>(
"try_cast(c0 as timestamp)", "2045-12-31 18:00:00")),
"Unable to convert timezone 'America/Los_Angeles' past 2037-11-01 09:00:00");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:01:01")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:01:01' to TIMESTAMP. Unknown timezone value: \"\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +24")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +24' to TIMESTAMP. Unknown timezone value: \"+24\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:60")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:60' to TIMESTAMP. Unknown timezone value: \"+01:60\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:01:60")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:01:60' to TIMESTAMP. Unknown timezone value: \"+01:01:60\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "2000-01-01 00:00:00 +01:01:01.1000")),
"Cannot cast VARCHAR '2000-01-01 00:00:00 +01:01:01.1000' to TIMESTAMP. Unknown timezone value: \"+01:01:01.1000\"");

setLegacyCast(true);
input = {
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -1520,7 +1520,8 @@ struct FromIso8601Timestamp {
return castResult.error();
}

auto [ts, timeZone] = castResult.value();
auto [ts, timeZone, offsetMillis] = castResult.value();
VELOX_DCHECK(!offsetMillis.has_value());
// Input string may not contain a timezone - if so, it is interpreted in
// session timezone.
if (!timeZone) {
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4366,6 +4366,9 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Timestamp) {
VELOX_ASSERT_THROW(
fromIso("1970-01-02T11:38:56.123 America/New_York"),
R"(Unable to parse timestamp value: "1970-01-02T11:38:56.123 America/New_York")");
VELOX_ASSERT_THROW(
fromIso("1970-01-02T11:38:56+16:00:01"),
"Unknown timezone value: \"+16:00:01\"");

VELOX_ASSERT_THROW(fromIso("T"), R"(Unable to parse timestamp value: "T")");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarcharInvalidInput) {
const auto invalidStringVector6 =
makeNullableFlatVector<StringView>({"2012-10 America/Los_Angeles"});

const auto invalidStringVector7 =
makeNullableFlatVector<StringView>({"2012-10-01 +16:00"});

auto millis = parseTimestamp("2012-10-31 07:00:47").toMillis();
auto timestamps = std::vector<int64_t>{millis};

Expand Down Expand Up @@ -204,6 +207,9 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarcharInvalidInput) {
VELOX_ASSERT_THROW(
testCast(invalidStringVector6, expected),
"Unable to parse timestamp value: \"2012-10 America/Los_Angeles\"");
VELOX_ASSERT_THROW(
testCast(invalidStringVector7, expected),
"Unknown timezone value in: \"2012-10-01 +16:00\"");
}

TEST_F(TimestampWithTimeZoneCastTest, toTimestamp) {
Expand Down
10 changes: 9 additions & 1 deletion velox/functions/prestosql/types/TimestampWithTimeZoneType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,18 @@ void castFromString(
if (castResult.hasError()) {
context.setStatus(row, castResult.error());
} else {
auto [ts, timeZone] = castResult.value();
auto [ts, timeZone, millisOffset] = castResult.value();
// Input string may not contain a timezone - if so, it is interpreted in
// session timezone.
if (timeZone == nullptr) {
if (millisOffset.has_value()) {
context.setStatus(
row,
Status::UserError(
"Unknown timezone value in: \"{}\"",
inputVector.valueAt(row)));
return;
}
const auto& config = context.execCtx()->queryCtx()->queryConfig();
timeZone = getTimeZoneFromConfig(config);
}
Expand Down
137 changes: 125 additions & 12 deletions velox/type/TimestampConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,11 +529,116 @@ bool tryParseTimeString(
return true;
}

// Parses a variety of timestamp strings, depending on the value of `parseMode`.
// Consumes as much of the string as it can and sets `result` to the
// timestamp from whatever it successfully parses. `pos` is set to the position
// of first character that was not consumed. Returns true if it successfully
// parsed at least a date, `result` is only set if true is returned.
// String format is [+/-]hh:mm:ss.MMM (minutes, seconds, and milliseconds are
// optional).
bool tryParsePrestoTimeOffsetString(
const char* buf,
size_t len,
size_t& pos,
int64_t& result) {
static constexpr int sep = ':';
int32_t hour = 0, min = 0, sec = 0, millis = 0;
pos = 0;
result = 0;

if (len == 0) {
return false;
}

if (buf[pos] != '+' && buf[pos] != '-') {
return false;
}

bool positive = buf[pos++] == '+';

if (pos >= len) {
return false;
}

// Read the hours.
if (!parseDoubleDigit(buf, len, pos, hour)) {
return false;
}
if (hour < 0 || hour >= 24) {
return false;
}

result += hour * kMillisPerHour;

if (pos >= len || (buf[pos] != sep && !characterIsDigit(buf[pos]))) {
result *= positive ? 1 : -1;
return pos == len;
}

// Skip the separator.
if (buf[pos] == sep) {
pos++;
}

// Read the minutes.
if (!parseDoubleDigit(buf, len, pos, min)) {
return false;
}
if (min < 0 || min >= 60) {
return false;
}

result += min * kMillisPerMinute;

if (pos >= len || (buf[pos] != sep && !characterIsDigit(buf[pos]))) {
result *= positive ? 1 : -1;
return pos == len;
}

// Skip the separator.
if (buf[pos] == sep) {
pos++;
}

// Try to read seconds.
if (!parseDoubleDigit(buf, len, pos, sec)) {
return false;
}
if (sec < 0 || sec >= 60) {
return false;
}

result += sec * kMillisPerSecond;

if (pos >= len ||
(buf[pos] != '.' && buf[pos] != ',' && !characterIsDigit(buf[pos]))) {
result *= positive ? 1 : -1;
return pos == len;
}

// Skip the decimal.
if (buf[pos] == '.' || buf[pos] == ',') {
pos++;
}

// Try to read microseconds.
if (pos >= len) {
return false;
}

// We expect milliseconds.
int32_t mult = 100;
for (; pos < len && mult > 0 && characterIsDigit(buf[pos]);
pos++, mult /= 10) {
millis += (buf[pos] - '0') * mult;
}

result += millis;
result *= positive ? 1 : -1;
return pos == len;
}

// Parses a variety of timestamp strings, depending on the value of
// `parseMode`. Consumes as much of the string as it can and sets `result` to
// the timestamp from whatever it successfully parses. `pos` is set to the
// position of first character that was not consumed. Returns true if it
// successfully parsed at least a date, `result` is only set if true is
// returned.
bool tryParseTimestampString(
const char* buf,
size_t len,
Expand Down Expand Up @@ -583,8 +688,8 @@ bool tryParseTimestampString(
if (!tryParseTimeString(
buf + pos, len - pos, timePos, microsSinceMidnight, parseMode)) {
// The rest of the string is not a valid time, but it could be relevant to
// the caller (e.g. it could be a time zone), return the date we parsed and
// let them decide what to do with the rest.
// the caller (e.g. it could be a time zone), return the date we parsed
// and let them decide what to do with the rest.
result = fromDatetime(daysSinceEpoch, 0);
return true;
}
Expand Down Expand Up @@ -857,8 +962,7 @@ fromTimestampString(const char* str, size_t len, TimestampParseMode parseMode) {
return resultTimestamp;
}

Expected<std::pair<Timestamp, const tz::TimeZone*>>
fromTimestampWithTimezoneString(
Expected<ParsedTimestampWithTimeZone> fromTimestampWithTimezoneString(
const char* str,
size_t len,
TimestampParseMode parseMode) {
Expand All @@ -870,6 +974,7 @@ fromTimestampWithTimezoneString(
}

const tz::TimeZone* timeZone = nullptr;
std::optional<int64_t> offset = std::nullopt;

if (pos < len && parseMode != TimestampParseMode::kIso8601 &&
characterIsSpace(str[pos])) {
Expand All @@ -894,8 +999,16 @@ fromTimestampWithTimezoneString(
std::string_view timeZoneName(str + pos, timezonePos - pos);

if ((timeZone = tz::locateZone(timeZoneName, false)) == nullptr) {
return folly::makeUnexpected(
Status::UserError("Unknown timezone value: \"{}\"", timeZoneName));
int64_t offsetMillis = 0;
size_t offsetPos = 0;
if (parseMode == TimestampParseMode::kPrestoCast &&
tryParsePrestoTimeOffsetString(
str + pos, timezonePos - pos, offsetPos, offsetMillis)) {
offset = offsetMillis;
} else {
return folly::makeUnexpected(
Status::UserError("Unknown timezone value: \"{}\"", timeZoneName));
}
}

// Skip any spaces at the end.
Expand All @@ -908,7 +1021,7 @@ fromTimestampWithTimezoneString(
return folly::makeUnexpected(parserError(str, len));
}
}
return std::make_pair(resultTimestamp, timeZone);
return {{resultTimestamp, timeZone, offset}};
}

int32_t toDate(const Timestamp& timestamp, const tz::TimeZone* timeZone_) {
Expand Down
Loading
Loading