Skip to content

Commit

Permalink
Raise NotImplementedError in to_datetime if Z (or tz component) in st…
Browse files Browse the repository at this point in the history
…ring (#14074)

closes #14039
Avoids the following discrepancy between cuDF and pandas when a date string has a timezone component:

```python
In [1]: import pandas

In [2]: import cudf

In [3]: data = ["2019-01-01T00:00:00.000Z"]

In [4]: cudf.to_datetime(data)
Out[4]: DatetimeIndex(['2019-01-01'], dtype='datetime64[ns]')

In [5]: pandas.to_datetime(data)
Out[5]: DatetimeIndex(['2019-01-01 00:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None)
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #14074
  • Loading branch information
mroeschke authored Sep 14, 2023
1 parent edfef80 commit 664dfc3
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 37 deletions.
15 changes: 10 additions & 5 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,10 @@ def infer_format(element: str, **kwargs) -> str:
fmt = _guess_datetime_format(element, **kwargs)

if fmt is not None:
if "%z" in fmt or "%Z" in fmt:
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
return fmt

element_parts = element.split(".")
Expand All @@ -651,11 +655,12 @@ def infer_format(element: str, **kwargs) -> str:
raise ValueError("Unable to infer the timestamp format from the data")

if len(second_parts) > 1:
# "Z" indicates Zulu time(widely used in aviation) - Which is
# UTC timezone that currently cudf only supports. Having any other
# unsupported timezone will let the code fail below
# with a ValueError.
second_parts.remove("Z")
# We may have a non-digit, timezone-like component
# like Z, UTC-3, +01:00
if any(re.search(r"\D", part) for part in second_parts):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
second_part = "".join(second_parts[1:])

if len(second_part) > 1:
Expand Down
49 changes: 23 additions & 26 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,40 +1250,31 @@ def test_datetime_reductions(data, op, dtype):
assert_eq(expected, actual)


@pytest.mark.parametrize("timezone", ["naive", "UTC"])
@pytest.mark.parametrize(
"data",
[
np.datetime_as_string(
np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"),
timezone="UTC",
),
np.datetime_as_string(
np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"),
timezone="UTC",
),
np.datetime_as_string(
np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"),
timezone="UTC",
),
np.datetime_as_string(
np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"),
timezone="UTC",
),
np.datetime_as_string(
np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"),
timezone="UTC",
),
np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"),
np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"),
np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"),
np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"),
np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"),
],
)
@pytest.mark.parametrize("dtype", DATETIME_TYPES)
def test_datetime_infer_format(data, dtype):
sr = cudf.Series(data)
psr = pd.Series(data)
def test_datetime_infer_format(data, timezone, dtype):
ts_data = np.datetime_as_string(data, timezone=timezone)
sr = cudf.Series(ts_data)
if timezone == "naive":
psr = pd.Series(ts_data)

expected = psr.astype(dtype)
actual = sr.astype(dtype)
expected = psr.astype(dtype)
actual = sr.astype(dtype)

assert_eq(expected, actual)
assert_eq(expected, actual)
else:
with pytest.raises(NotImplementedError):
sr.astype(dtype)


def test_dateoffset_instance_subclass_check():
Expand Down Expand Up @@ -2158,6 +2149,12 @@ def test_format_timezone_not_implemented(code):
)


@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"])
def test_no_format_timezone_not_implemented(tz):
    """Check that to_datetime rejects timezone suffixes when no format is given.

    Each parametrized suffix is appended to an otherwise plain timestamp
    string, and the call is expected to raise NotImplementedError.
    """
    timestamp = f"2020-01-01 00:00:00{tz}"
    with pytest.raises(NotImplementedError):
        cudf.to_datetime([timestamp])


@pytest.mark.parametrize("arg", [True, False])
def test_args_not_datetime_typerror(arg):
with pytest.raises(TypeError):
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,12 @@ def test_string_astype(dtype):
data = ["True", "False", "True", "False", "False"]
elif dtype.startswith("datetime64"):
data = [
"2019-06-04T00:00:00Z",
"2019-06-04T12:12:12Z",
"2019-06-03T00:00:00Z",
"2019-05-04T00:00:00Z",
"2018-06-04T00:00:00Z",
"1922-07-21T01:02:03Z",
"2019-06-04T00:00:00",
"2019-06-04T12:12:12",
"2019-06-03T00:00:00",
"2019-05-04T00:00:00",
"2018-06-04T00:00:00",
"1922-07-21T01:02:03",
]
elif dtype == "str" or dtype == "object":
data = ["ab", "cd", "ef", "gh", "ij"]
Expand Down

0 comments on commit 664dfc3

Please sign in to comment.