Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add datetime selector #1822

Merged
merged 31 commits into from
Feb 9, 2025
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
9d9b735
feat: datetime selector
FBruzzesi Jan 17, 2025
aedb28c
unit test
FBruzzesi Jan 17, 2025
438eb27
use .dt.convert_time_zone first
FBruzzesi Jan 18, 2025
ef1d271
maybe with backport
FBruzzesi Jan 18, 2025
4f05167
forgot pyproject ;)
FBruzzesi Jan 18, 2025
e5493e1
fail pyarrow on windows, use replace_time_zone
FBruzzesi Jan 18, 2025
7899103
fail for old pandas
FBruzzesi Jan 18, 2025
b7c21f7
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Jan 19, 2025
6b0b006
add is_order_dependent arg
FBruzzesi Jan 19, 2025
26ed283
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Jan 20, 2025
57672ec
it passes :)
FBruzzesi Jan 20, 2025
e5ad0c0
force pytest to run with utc
FBruzzesi Jan 20, 2025
319624f
Update pyproject.toml
FBruzzesi Jan 20, 2025
0edd7ca
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Jan 20, 2025
cb4823f
fix up
FBruzzesi Jan 20, 2025
36e215f
merge main
FBruzzesi Feb 1, 2025
9c8fbfb
arrow type hint
FBruzzesi Feb 1, 2025
11124f7
rethink the logic
FBruzzesi Feb 1, 2025
4010a6f
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Feb 3, 2025
a57dc1d
add pyspark & duckdb, test timezone.utc
FBruzzesi Feb 3, 2025
51c3d81
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Feb 4, 2025
b789c00
add 's' in default time_units, trim docstrings
FBruzzesi Feb 5, 2025
40f3d8f
rm kwargs={dtypes}
FBruzzesi Feb 5, 2025
a9d9f37
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Feb 5, 2025
b2d8818
whops
FBruzzesi Feb 5, 2025
610d553
Merge branch 'main' into feat/selectors-by-datetime
FBruzzesi Feb 8, 2025
242d2fa
Merge branch 'main' into feat/selectors-by-datetime
MarcoGorelli Feb 8, 2025
22eee5d
feedback adjustments
FBruzzesi Feb 8, 2025
c166dff
more docstring trimming
FBruzzesi Feb 8, 2025
64f7f5c
merge main and rm comment
FBruzzesi Feb 9, 2025
b998459
merge main
FBruzzesi Feb 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/selectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ set operations are supported:
- boolean
- by_dtype
- categorical
- datetime
- matches
- numeric
- string
Expand Down
2 changes: 2 additions & 0 deletions docs/api-reference/typing.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ Narwhals comes fully statically typed. In addition to `nw.DataFrame`, `nw.Expr`,
- IntoFrameT
- IntoSeries
- IntoSeriesT
- SizeUnit
- TimeUnit
show_source: false
show_bases: false

Expand Down
4 changes: 2 additions & 2 deletions narwhals/_arrow/expr_dt.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Literal

from narwhals._expression_parsing import reuse_series_namespace_implementation

if TYPE_CHECKING:
from typing_extensions import Self

from narwhals._arrow.expr import ArrowExpr
from narwhals.typing import TimeUnit


class ArrowExprDateTimeNamespace:
Expand All @@ -30,7 +30,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr:
self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone
)

def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> ArrowExpr:
# Expression-level `dt.timestamp` for the PyArrow backend.
# No conversion logic lives here: the call is forwarded, together with
# `time_unit`, to the "timestamp" method of the "dt" namespace through
# `reuse_series_namespace_implementation`, starting from the wrapped
# `self._compliant_expr`.
# NOTE(review): presumably this resolves to the Series-level implementation
# (`ArrowSeriesDateTimeNamespace.timestamp`) -- confirm against the helper.
def timestamp(self: Self, time_unit: TimeUnit) -> ArrowExpr:
return reuse_series_namespace_implementation(
self._compliant_expr, "dt", "timestamp", time_unit=time_unit
)
Expand Down
53 changes: 52 additions & 1 deletion narwhals/_arrow/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,24 @@
import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Sequence

from narwhals._arrow.expr import ArrowExpr
from narwhals.utils import Implementation
from narwhals.utils import _parse_time_unit_and_time_zone
from narwhals.utils import dtype_matches_time_unit_and_time_zone
from narwhals.utils import import_dtypes_module

if TYPE_CHECKING:
from datetime import timezone

from typing_extensions import Self

from narwhals._arrow.dataframe import ArrowDataFrame
from narwhals._arrow.series import ArrowSeries
from narwhals.dtypes import DType
from narwhals.typing import TimeUnit
from narwhals.utils import Version


Expand All @@ -26,7 +32,7 @@ def __init__(
self._implementation = Implementation.PYARROW
self._version = version

def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> ArrowSelector:
def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> ArrowSelector:
def func(df: ArrowDataFrame) -> list[ArrowSeries]:
return [df[col] for col in df.columns if df.schema[col] in dtypes]

Expand Down Expand Up @@ -108,6 +114,51 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
kwargs={},
)

def datetime(
    self: Self,
    time_unit: TimeUnit | Iterable[TimeUnit] | None,
    time_zone: str | timezone | Iterable[str | timezone | None] | None,
) -> ArrowSelector:
    """Build a selector for datetime columns matching a time unit / time zone.

    Arguments:
        time_unit: A single time unit, an iterable of time units, or `None`.
            Normalised by `_parse_time_unit_and_time_zone` before use.
        time_zone: A single time zone (string or `datetime.timezone`), an
            iterable of time zones (entries may be `None`), or `None`.
            Normalised by `_parse_time_unit_and_time_zone` before use.

    Returns:
        An `ArrowSelector` which, when evaluated against a frame, yields every
        column whose schema dtype satisfies
        `dtype_matches_time_unit_and_time_zone` for the normalised time units
        and time zones, in the frame's column order.
    """
    dtypes = import_dtypes_module(version=self._version)
    time_units, time_zones = _parse_time_unit_and_time_zone(
        time_unit=time_unit, time_zone=time_zone
    )

    def evaluate_output_names(df: ArrowDataFrame) -> Sequence[str]:
        # Single source of truth for which columns the selector matches, so
        # the returned series and the reported output names cannot drift.
        return [
            col
            for col in df.columns
            if dtype_matches_time_unit_and_time_zone(
                dtype=df.schema[col],
                dtypes=dtypes,
                time_units=time_units,
                time_zones=time_zones,
            )
        ]

    def func(df: ArrowDataFrame) -> list[ArrowSeries]:
        return [df[col] for col in evaluate_output_names(df)]

    return ArrowSelector(
        func,
        depth=0,
        function_name="selector",
        evaluate_output_names=evaluate_output_names,
        alias_output_names=None,
        backend_version=self._backend_version,
        version=self._version,
        # `dtypes` is the dtypes *module*, not an argument of this selector;
        # keep kwargs empty like the sibling selectors do.
        kwargs={},
    )


class ArrowSelector(ArrowExpr):
def __repr__(self: Self) -> str: # pragma: no cover
Expand Down
4 changes: 2 additions & 2 deletions narwhals/_arrow/series_dt.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Literal

import pyarrow as pa
import pyarrow.compute as pc
Expand All @@ -13,6 +12,7 @@
from typing_extensions import Self

from narwhals._arrow.series import ArrowSeries
from narwhals.typing import TimeUnit


class ArrowSeriesDateTimeNamespace:
Expand Down Expand Up @@ -49,7 +49,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries:

return self._compliant_series._from_native_series(result)

def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> ArrowSeries:
def timestamp(self: Self, time_unit: TimeUnit) -> ArrowSeries:
s = self._compliant_series._native_series
dtype = self._compliant_series.dtype
dtypes = import_dtypes_module(self._compliant_series._version)
Expand Down
6 changes: 3 additions & 3 deletions narwhals/_dask/expr_dt.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Literal

from narwhals._pandas_like.utils import calculate_timestamp_date
from narwhals._pandas_like.utils import calculate_timestamp_datetime
Expand All @@ -18,6 +17,7 @@
from typing_extensions import Self

from narwhals._dask.expr import DaskExpr
from narwhals.typing import TimeUnit


class DaskExprDateTimeNamespace:
Expand Down Expand Up @@ -145,8 +145,8 @@ def func(s: dx.Series, time_zone: str) -> dx.Series:
returns_scalar=self._compliant_expr._returns_scalar,
)

def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> DaskExpr:
def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"]) -> dx.Series:
def timestamp(self: Self, time_unit: TimeUnit) -> DaskExpr:
def func(s: dx.Series, time_unit: TimeUnit) -> dx.Series:
dtype = native_to_narwhals_dtype(
s, self._compliant_expr._version, Implementation.DASK
)
Expand Down
56 changes: 56 additions & 0 deletions narwhals/_dask/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,23 @@
from typing import Sequence

from narwhals._dask.expr import DaskExpr
from narwhals.utils import _parse_time_unit_and_time_zone
from narwhals.utils import dtype_matches_time_unit_and_time_zone
from narwhals.utils import import_dtypes_module

if TYPE_CHECKING:
try:
import dask.dataframe.dask_expr as dx
except ModuleNotFoundError:
import dask_expr as dx

from datetime import timezone

from typing_extensions import Self

from narwhals._dask.dataframe import DaskLazyFrame
from narwhals.dtypes import DType
from narwhals.typing import TimeUnit
from narwhals.utils import Version

try:
Expand Down Expand Up @@ -118,6 +128,52 @@ def func(df: DaskLazyFrame) -> list[dx.Series]:
kwargs={},
)

def datetime(
self: Self,
time_unit: TimeUnit | Iterable[TimeUnit] | None,
time_zone: str | timezone | Iterable[str | timezone | None] | None,
) -> DaskSelector: # pragma: no cover
Copy link
Member Author

@FBruzzesi FBruzzesi Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For dask, the selector works, but the cast fails with:

TypeError: Cannot use .astype to convert from timezone-aware dtype to timezone-naive dtype. Use obj.tz_localize(None) or obj.tz_convert('UTC').tz_localize(None) instead.

dtypes = import_dtypes_module(version=self._version)
time_units, time_zones = _parse_time_unit_and_time_zone(
time_unit=time_unit, time_zone=time_zone
)

def func(df: DaskLazyFrame) -> list[dx.Series]:
return [
df._native_frame[col]
for col in df.columns
if dtype_matches_time_unit_and_time_zone(
dtype=df.schema[col],
dtypes=dtypes,
time_units=time_units,
time_zones=time_zones,
)
]

def evalute_output_names(df: DaskLazyFrame) -> Sequence[str]:
return [
col
for col in df.columns
if dtype_matches_time_unit_and_time_zone(
dtype=df.schema[col],
dtypes=dtypes,
time_units=time_units,
time_zones=time_zones,
)
]

return DaskSelector(
func,
depth=0,
function_name="selector",
evaluate_output_names=evalute_output_names,
alias_output_names=None,
backend_version=self._backend_version,
returns_scalar=False,
version=self._version,
kwargs={},
)


class DaskSelector(DaskExpr):
def __repr__(self: Self) -> str: # pragma: no cover
Expand Down
49 changes: 49 additions & 0 deletions narwhals/_duckdb/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,19 @@

from narwhals._duckdb.expr import DuckDBExpr
from narwhals._duckdb.utils import ExprKind
from narwhals.utils import _parse_time_unit_and_time_zone
from narwhals.utils import dtype_matches_time_unit_and_time_zone
from narwhals.utils import import_dtypes_module

if TYPE_CHECKING:
from datetime import timezone

import duckdb
from typing_extensions import Self

from narwhals._duckdb.dataframe import DuckDBLazyFrame
from narwhals.dtypes import DType
from narwhals.typing import TimeUnit
from narwhals.utils import Version


Expand Down Expand Up @@ -111,6 +116,50 @@ def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
version=self._version,
)

def datetime(
    self: Self,
    time_unit: TimeUnit | Iterable[TimeUnit] | None,
    time_zone: str | timezone | Iterable[str | timezone | None] | None,
) -> DuckDBSelector:
    """Create a selector for datetime columns filtered by unit and zone.

    Arguments:
        time_unit: One time unit, several time units, or `None`; normalised
            through `_parse_time_unit_and_time_zone`.
        time_zone: One time zone, several time zones (entries may be `None`),
            or `None`; normalised through `_parse_time_unit_and_time_zone`.

    Returns:
        A `DuckDBSelector` producing one `ColumnExpression` per column whose
        schema dtype passes `dtype_matches_time_unit_and_time_zone`.
    """
    dtypes = import_dtypes_module(version=self._version)
    time_units, time_zones = _parse_time_unit_and_time_zone(
        time_unit=time_unit, time_zone=time_zone
    )

    def is_selected(df: DuckDBLazyFrame, name: str) -> bool:
        # Predicate shared by both callbacks below.
        return dtype_matches_time_unit_and_time_zone(
            dtype=df.schema[name],
            dtypes=dtypes,
            time_units=time_units,
            time_zones=time_zones,
        )

    def output_names(df: DuckDBLazyFrame) -> Sequence[str]:
        selected = []
        for name in df.columns:
            if is_selected(df, name):
                selected.append(name)
        return selected

    def expressions(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
        return [ColumnExpression(name) for name in output_names(df)]

    return DuckDBSelector(
        expressions,
        function_name="selector",
        evaluate_output_names=output_names,
        alias_output_names=None,
        backend_version=self._backend_version,
        expr_kind=ExprKind.TRANSFORM,
        version=self._version,
    )


class DuckDBSelector(DuckDBExpr):
def __repr__(self: Self) -> str: # pragma: no cover
Expand Down
4 changes: 2 additions & 2 deletions narwhals/_pandas_like/expr_dt.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Literal

from narwhals._expression_parsing import reuse_series_namespace_implementation

if TYPE_CHECKING:
from typing_extensions import Self

from narwhals._pandas_like.expr import PandasLikeExpr
from narwhals.typing import TimeUnit


class PandasLikeExprDateTimeNamespace:
Expand Down Expand Up @@ -101,7 +101,7 @@ def convert_time_zone(self: Self, time_zone: str) -> PandasLikeExpr:
self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone
)

def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> PandasLikeExpr:
# Expression-level `dt.timestamp` for the pandas-like backends.
# No conversion logic lives here: the call is forwarded, together with
# `time_unit`, to the "timestamp" method of the "dt" namespace through
# `reuse_series_namespace_implementation`, starting from the wrapped
# `self._compliant_expr`.
# NOTE(review): presumably this resolves to the Series-level pandas-like
# implementation -- confirm against the helper.
def timestamp(self: Self, time_unit: TimeUnit) -> PandasLikeExpr:
return reuse_series_namespace_implementation(
self._compliant_expr, "dt", "timestamp", time_unit=time_unit
)
Loading
Loading