diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md index aa990ab18..bf1fc9d6a 100644 --- a/docs/api-reference/selectors.md +++ b/docs/api-reference/selectors.md @@ -15,6 +15,7 @@ set operations are supported: - boolean - by_dtype - categorical + - datetime - matches - numeric - string diff --git a/docs/api-reference/typing.md b/docs/api-reference/typing.md index 9791b50a4..4097b6378 100644 --- a/docs/api-reference/typing.md +++ b/docs/api-reference/typing.md @@ -17,6 +17,8 @@ Narwhals comes fully statically typed. In addition to `nw.DataFrame`, `nw.Expr`, - IntoFrameT - IntoSeries - IntoSeriesT + - SizeUnit + - TimeUnit show_source: false show_bases: false diff --git a/narwhals/_arrow/expr_dt.py b/narwhals/_arrow/expr_dt.py index d5e596500..30d2e22c8 100644 --- a/narwhals/_arrow/expr_dt.py +++ b/narwhals/_arrow/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._expression_parsing import reuse_series_namespace_implementation @@ -9,6 +8,7 @@ from typing_extensions import Self from narwhals._arrow.expr import ArrowExpr + from narwhals.typing import TimeUnit class ArrowExprDateTimeNamespace: @@ -30,7 +30,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> ArrowExpr: + def timestamp(self: Self, time_unit: TimeUnit) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, "dt", "timestamp", time_unit=time_unit ) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 15ce43395..50c02d9b8 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -3,18 +3,24 @@ import re from typing import TYPE_CHECKING from typing import Any +from typing import Iterable from typing import Sequence from narwhals._arrow.expr import ArrowExpr from 
narwhals.utils import Implementation +from narwhals.utils import _parse_time_unit_and_time_zone +from narwhals.utils import dtype_matches_time_unit_and_time_zone from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from datetime import timezone + from typing_extensions import Self from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -26,7 +32,7 @@ def __init__( self._implementation = Implementation.PYARROW self._version = version - def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> ArrowSelector: + def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: return [df[col] for col in df.columns if df.schema[col] in dtypes] @@ -108,6 +114,51 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> ArrowSelector: + dtypes = import_dtypes_module(version=self._version) + time_units, time_zones = _parse_time_unit_and_time_zone( + time_unit=time_unit, time_zone=time_zone + ) + + def func(df: ArrowDataFrame) -> list[ArrowSeries]: + return [ + df[col] + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + def evalute_output_names(df: ArrowDataFrame) -> Sequence[str]: + return [ + col + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + return ArrowSelector( + func, + depth=0, + function_name="selector", + evaluate_output_names=evalute_output_names, + alias_output_names=None, + backend_version=self._backend_version, + 
version=self._version, + kwargs={}, + ) + class ArrowSelector(ArrowExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_arrow/series_dt.py b/narwhals/_arrow/series_dt.py index 51cd15357..7f10324de 100644 --- a/narwhals/_arrow/series_dt.py +++ b/narwhals/_arrow/series_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal import pyarrow as pa import pyarrow.compute as pc @@ -13,6 +12,7 @@ from typing_extensions import Self from narwhals._arrow.series import ArrowSeries + from narwhals.typing import TimeUnit class ArrowSeriesDateTimeNamespace: @@ -49,7 +49,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: return self._compliant_series._from_native_series(result) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> ArrowSeries: + def timestamp(self: Self, time_unit: TimeUnit) -> ArrowSeries: s = self._compliant_series._native_series dtype = self._compliant_series.dtype dtypes = import_dtypes_module(self._compliant_series._version) diff --git a/narwhals/_dask/expr_dt.py b/narwhals/_dask/expr_dt.py index c282304b2..a97404372 100644 --- a/narwhals/_dask/expr_dt.py +++ b/narwhals/_dask/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime @@ -18,6 +17,7 @@ from typing_extensions import Self from narwhals._dask.expr import DaskExpr + from narwhals.typing import TimeUnit class DaskExprDateTimeNamespace: @@ -145,8 +145,8 @@ def func(s: dx.Series, time_zone: str) -> dx.Series: returns_scalar=self._compliant_expr._returns_scalar, ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> DaskExpr: - def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"]) -> dx.Series: + def timestamp(self: Self, time_unit: TimeUnit) -> DaskExpr: + def 
func(s: dx.Series, time_unit: TimeUnit) -> dx.Series: dtype = native_to_narwhals_dtype( s.dtype, self._compliant_expr._version, Implementation.DASK ) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index f411d0a98..9c9c78965 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -7,13 +7,23 @@ from typing import Sequence from narwhals._dask.expr import DaskExpr +from narwhals.utils import _parse_time_unit_and_time_zone +from narwhals.utils import dtype_matches_time_unit_and_time_zone from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx + + from datetime import timezone + from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version try: @@ -118,6 +128,52 @@ def func(df: DaskLazyFrame) -> list[dx.Series]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> DaskSelector: # pragma: no cover + dtypes = import_dtypes_module(version=self._version) + time_units, time_zones = _parse_time_unit_and_time_zone( + time_unit=time_unit, time_zone=time_zone + ) + + def func(df: DaskLazyFrame) -> list[dx.Series]: + return [ + df._native_frame[col] + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + def evalute_output_names(df: DaskLazyFrame) -> Sequence[str]: + return [ + col + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + return DaskSelector( + func, + depth=0, + function_name="selector", + evaluate_output_names=evalute_output_names, + 
alias_output_names=None, + backend_version=self._backend_version, + returns_scalar=False, + version=self._version, + kwargs={}, + ) + class DaskSelector(DaskExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 28802c031..f40794ef3 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -10,14 +10,19 @@ from narwhals._duckdb.expr import DuckDBExpr from narwhals._duckdb.utils import ExprKind +from narwhals.utils import _parse_time_unit_and_time_zone +from narwhals.utils import dtype_matches_time_unit_and_time_zone from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from datetime import timezone + import duckdb from typing_extensions import Self from narwhals._duckdb.dataframe import DuckDBLazyFrame from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -111,6 +116,50 @@ def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: version=self._version, ) + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> DuckDBSelector: + dtypes = import_dtypes_module(version=self._version) + time_units, time_zones = _parse_time_unit_and_time_zone( + time_unit=time_unit, time_zone=time_zone + ) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + return [ + ColumnExpression(col) + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + def evalute_output_names(df: DuckDBLazyFrame) -> Sequence[str]: + return [ + col + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + return DuckDBSelector( + func, + function_name="selector", + 
evaluate_output_names=evalute_output_names, + alias_output_names=None, + backend_version=self._backend_version, + expr_kind=ExprKind.TRANSFORM, + version=self._version, + ) + class DuckDBSelector(DuckDBExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/_pandas_like/expr_dt.py b/narwhals/_pandas_like/expr_dt.py index 9ec8648ec..ed5ffe8f1 100644 --- a/narwhals/_pandas_like/expr_dt.py +++ b/narwhals/_pandas_like/expr_dt.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Literal from narwhals._expression_parsing import reuse_series_namespace_implementation @@ -9,6 +8,7 @@ from typing_extensions import Self from narwhals._pandas_like.expr import PandasLikeExpr + from narwhals.typing import TimeUnit class PandasLikeExprDateTimeNamespace: @@ -101,7 +101,7 @@ def convert_time_zone(self: Self, time_zone: str) -> PandasLikeExpr: self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> PandasLikeExpr: + def timestamp(self: Self, time_unit: TimeUnit) -> PandasLikeExpr: return reuse_series_namespace_implementation( self._compliant_expr, "dt", "timestamp", time_unit=time_unit ) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 5074425ef..b8cd33b71 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -7,14 +7,19 @@ from typing import Sequence from narwhals._pandas_like.expr import PandasLikeExpr +from narwhals.utils import _parse_time_unit_and_time_zone +from narwhals.utils import dtype_matches_time_unit_and_time_zone from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from datetime import timezone + from typing_extensions import Self from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType + from narwhals.typing import 
TimeUnit from narwhals.utils import Implementation from narwhals.utils import Version @@ -116,6 +121,52 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: kwargs={}, ) + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> PandasSelector: + dtypes = import_dtypes_module(version=self._version) + time_units, time_zones = _parse_time_unit_and_time_zone( + time_unit=time_unit, time_zone=time_zone + ) + + def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: + return [ + df[col] + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + def evalute_output_names(df: PandasLikeDataFrame) -> Sequence[str]: + return [ + col + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + return PandasSelector( + func, + depth=0, + function_name="selector", + evaluate_output_names=evalute_output_names, + alias_output_names=None, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + class PandasSelector(PandasLikeExpr): def __repr__(self) -> str: # pragma: no cover diff --git a/narwhals/_pandas_like/series_dt.py b/narwhals/_pandas_like/series_dt.py index 0d480e86e..bbebbc6b7 100644 --- a/narwhals/_pandas_like/series_dt.py +++ b/narwhals/_pandas_like/series_dt.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING from typing import Any -from typing import Literal from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime @@ -14,6 +13,7 @@ from typing_extensions import Self from narwhals._pandas_like.series import PandasLikeSeries + from narwhals.typing import TimeUnit class 
PandasLikeSeriesDateTimeNamespace: @@ -208,7 +208,7 @@ def convert_time_zone(self: Self, time_zone: str) -> PandasLikeSeries: result = self._compliant_series._native_series.dt.tz_convert(time_zone) return self._compliant_series._from_native_series(result) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"]) -> PandasLikeSeries: + def timestamp(self: Self, time_unit: TimeUnit) -> PandasLikeSeries: s = self._compliant_series._native_series dtype = self._compliant_series.dtype is_pyarrow_dtype = "pyarrow" in str(self._compliant_series._native_series.dtype) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 7c65e28d3..28ca092a0 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable -from typing import Literal from typing import Sequence from typing import TypeVar @@ -30,6 +29,7 @@ from narwhals._pandas_like.series import PandasLikeSeries from narwhals.dtypes import DType from narwhals.typing import DTypeBackend + from narwhals.typing import TimeUnit from narwhals.typing import _1DArray ExprT = TypeVar("ExprT", bound=PandasLikeExpr) @@ -439,13 +439,13 @@ def non_object_native_to_narwhals_dtype( if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( match_ := PATTERN_PA_DATETIME.match(dtype) ): - dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + dt_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] dt_time_zone: str | None = match_.group("time_zone") return dtypes.Datetime(dt_time_unit, dt_time_zone) if (match_ := PATTERN_PD_DURATION.match(dtype)) or ( match_ := PATTERN_PA_DURATION.match(dtype) ): - du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment] + du_time_unit: TimeUnit = match_.group("time_unit") # type: ignore[assignment] return dtypes.Duration(du_time_unit) if dtype == 
"date32[day][pyarrow]": return dtypes.Date() diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 3b3d268ce..e1e00ba18 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -18,12 +18,15 @@ from narwhals.utils import Implementation if TYPE_CHECKING: + from datetime import timezone + from typing_extensions import Self from narwhals._polars.dataframe import PolarsDataFrame from narwhals._polars.dataframe import PolarsLazyFrame from narwhals._polars.expr import PolarsExpr from narwhals._polars.typing import IntoPolarsExpr + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -283,3 +286,18 @@ def all(self: Self) -> PolarsExpr: version=self._version, backend_version=self._backend_version, ) + + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> PolarsExpr: + import polars as pl + + from narwhals._polars.expr import PolarsExpr + + return PolarsExpr( + pl.selectors.datetime(time_unit=time_unit, time_zone=time_zone), # type: ignore[arg-type] + version=self._version, + backend_version=self._backend_version, + ) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 94510d2ef..929830316 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -3,7 +3,6 @@ from functools import lru_cache from typing import TYPE_CHECKING from typing import Any -from typing import Literal from typing import TypeVar from typing import overload @@ -17,6 +16,7 @@ from narwhals._polars.expr import PolarsExpr from narwhals._polars.series import PolarsSeries from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version T = TypeVar("T") @@ -111,11 +111,11 @@ def native_to_narwhals_dtype( if dtype == pl.Date: return dtypes.Date() if dtype == pl.Datetime: - dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + 
dt_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") dt_time_zone = getattr(dtype, "time_zone", None) return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone) if dtype == pl.Duration: - du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") + du_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") return dtypes.Duration(time_unit=du_time_unit) if dtype == pl.Struct: return dtypes.Struct( @@ -186,12 +186,12 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> pl if dtype == dtypes.Date: return pl.Date() if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime): - dt_time_unit: Literal["ms", "us", "ns"] = getattr(dtype, "time_unit", "us") + dt_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") dt_time_zone = getattr(dtype, "time_zone", None) - return pl.Datetime(dt_time_unit, dt_time_zone) + return pl.Datetime(dt_time_unit, dt_time_zone) # type: ignore[arg-type] if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration): - du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us") - return pl.Duration(time_unit=du_time_unit) + du_time_unit: TimeUnit = getattr(dtype, "time_unit", "us") + return pl.Duration(time_unit=du_time_unit) # type: ignore[arg-type] if dtype == dtypes.List: return pl.List(narwhals_to_native_dtype(dtype.inner, version)) # type: ignore[union-attr] if dtype == dtypes.Struct: diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index f2425de5c..e10b5c529 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -9,14 +9,19 @@ from narwhals._spark_like.expr import SparkLikeExpr from narwhals._spark_like.utils import ExprKind from narwhals.utils import Implementation +from narwhals.utils import _parse_time_unit_and_time_zone +from narwhals.utils import dtype_matches_time_unit_and_time_zone from narwhals.utils import import_dtypes_module if TYPE_CHECKING: + from datetime import 
timezone + from pyspark.sql import Column from typing_extensions import Self from narwhals._spark_like.dataframe import SparkLikeLazyFrame from narwhals.dtypes import DType + from narwhals.typing import TimeUnit from narwhals.utils import Version @@ -114,6 +119,51 @@ def func(df: SparkLikeLazyFrame) -> list[Column]: implementation=self._implementation, ) + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> SparkLikeSelector: + dtypes = import_dtypes_module(version=self._version) + time_units, time_zones = _parse_time_unit_and_time_zone( + time_unit=time_unit, time_zone=time_zone + ) + + def func(df: SparkLikeLazyFrame) -> list[Column]: + return [ + df._F.col(col) + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + def evalute_output_names(df: SparkLikeLazyFrame) -> Sequence[str]: + return [ + col + for col in df.columns + if dtype_matches_time_unit_and_time_zone( + dtype=df.schema[col], + dtypes=dtypes, + time_units=time_units, + time_zones=time_zones, + ) + ] + + return SparkLikeSelector( + func, + function_name="selector", + evaluate_output_names=evalute_output_names, + alias_output_names=None, + backend_version=self._backend_version, + expr_kind=ExprKind.TRANSFORM, + version=self._version, + implementation=self._implementation, + ) + class SparkLikeSelector(SparkLikeExpr): def __repr__(self: Self) -> str: # pragma: no cover diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 00e81ac9f..4a9361073 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -10,11 +10,12 @@ if TYPE_CHECKING: from typing import Iterator - from typing import Literal from typing import Sequence from typing_extensions import Self + from narwhals.typing import TimeUnit + def _validate_dtype(dtype: DType | type[DType]) -> None: if not 
isinstance_or_issubclass(dtype, DType): @@ -490,7 +491,7 @@ class Datetime(TemporalType): def __init__( self: Self, - time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_unit: TimeUnit = "us", time_zone: str | timezone | None = None, ) -> None: if time_unit not in {"s", "ms", "us", "ns"}: @@ -553,7 +554,7 @@ class Duration(TemporalType): def __init__( self: Self, - time_unit: Literal["us", "ns", "ms", "s"] = "us", + time_unit: TimeUnit = "us", ) -> None: if time_unit not in ("s", "ms", "us", "ns"): msg = ( diff --git a/narwhals/expr_dt.py b/narwhals/expr_dt.py index 9495bc4d4..912723652 100644 --- a/narwhals/expr_dt.py +++ b/narwhals/expr_dt.py @@ -2,13 +2,13 @@ from typing import TYPE_CHECKING from typing import Generic -from typing import Literal from typing import TypeVar if TYPE_CHECKING: from typing_extensions import Self from narwhals.expr import Expr + from narwhals.typing import TimeUnit ExprT = TypeVar("ExprT", bound="Expr") @@ -1409,7 +1409,7 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: aggregates=self._expr._aggregates, ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: + def timestamp(self: Self, time_unit: TimeUnit = "us") -> ExprT: """Return a timestamp in the given time unit. Arguments: diff --git a/narwhals/selectors.py b/narwhals/selectors.py index 5c0810360..3b74b5c49 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -9,9 +9,12 @@ from narwhals.utils import flatten if TYPE_CHECKING: + from datetime import timezone + from typing_extensions import Self from narwhals.dtypes import DType + from narwhals.typing import TimeUnit class Selector(Expr): @@ -474,11 +477,75 @@ def all() -> Selector: ) +def datetime( + time_unit: TimeUnit | Iterable[TimeUnit] | None = None, + time_zone: str | timezone | Iterable[str | timezone | None] | None = ("*", None), +) -> Selector: + """Select all datetime columns, optionally filtering by time unit/zone. 
+ + Arguments: + time_unit: One (or more) of the allowed time unit precision strings, "ms", "us", + "ns" and "s". Omit to select columns with any valid time unit. + time_zone: Specify which timezone(s) to select: + + * One or more timezone strings, as defined in zoneinfo (for a full list of valid + options run `import zoneinfo; zoneinfo.available_timezones()`). + * Set `None` to select Datetime columns that do not have a timezone. + * Set `"*"` to select Datetime columns that have *any* timezone. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime, timezone + >>> import pyarrow as pa + >>> import narwhals as nw + >>> import narwhals.selectors as ncs + >>> + >>> utc_tz = timezone.utc + >>> data = { + ... "tstamp_utc": [ + ... datetime(2023, 4, 10, 12, 14, 16, 999000, tzinfo=utc_tz), + ... datetime(2025, 8, 25, 14, 18, 22, 666000, tzinfo=utc_tz), + ... ], + ... "tstamp": [ + ... datetime(2000, 11, 20, 18, 12, 16, 600000), + ... datetime(2020, 10, 30, 10, 20, 25, 123000), + ... ], + ... "numeric": [3.14, 6.28], + ... 
} + >>> df_native = pa.table(data) + >>> df_nw = nw.from_native(df_native) + >>> df_nw.select(ncs.datetime()).to_native() + pyarrow.Table + tstamp_utc: timestamp[us, tz=UTC] + tstamp: timestamp[us] + ---- + tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] + tstamp: [[2000-11-20 18:12:16.600000,2020-10-30 10:20:25.123000]] + + Select only datetime columns that have any time_zone specification: + + >>> df_nw.select(ncs.datetime(time_zone="*")).to_native() + pyarrow.Table + tstamp_utc: timestamp[us, tz=UTC] + ---- + tstamp_utc: [[2023-04-10 12:14:16.999000Z,2025-08-25 14:18:22.666000Z]] + """ + return Selector( + lambda plx: plx.selectors.datetime(time_unit=time_unit, time_zone=time_zone), + is_order_dependent=False, + changes_length=False, + aggregates=False, + ) + + __all__ = [ "all", "boolean", "by_dtype", "categorical", + "datetime", "matches", "numeric", "string", diff --git a/narwhals/series_dt.py b/narwhals/series_dt.py index 5fea4ff5c..10f53128c 100644 --- a/narwhals/series_dt.py +++ b/narwhals/series_dt.py @@ -3,13 +3,13 @@ from typing import TYPE_CHECKING from typing import Any from typing import Generic -from typing import Literal from typing import TypeVar if TYPE_CHECKING: from typing_extensions import Self from narwhals.series import Series + from narwhals.typing import TimeUnit SeriesT = TypeVar("SeriesT", bound="Series[Any]") @@ -1212,7 +1212,7 @@ def convert_time_zone(self: Self, time_zone: str) -> SeriesT: self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) ) - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> SeriesT: + def timestamp(self: Self, time_unit: TimeUnit = "us") -> SeriesT: """Return a timestamp in the given time unit. 
Arguments: diff --git a/narwhals/stable/v1/selectors.py b/narwhals/stable/v1/selectors.py index f938dfa35..157e3d3f4 100644 --- a/narwhals/stable/v1/selectors.py +++ b/narwhals/stable/v1/selectors.py @@ -4,6 +4,7 @@ from narwhals.selectors import boolean from narwhals.selectors import by_dtype from narwhals.selectors import categorical +from narwhals.selectors import datetime from narwhals.selectors import matches from narwhals.selectors import numeric from narwhals.selectors import string @@ -13,6 +14,7 @@ "boolean", "by_dtype", "categorical", + "datetime", "matches", "numeric", "string", diff --git a/narwhals/typing.py b/narwhals/typing.py index 1a56623ea..cd6caa984 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -258,6 +258,8 @@ def __native_namespace__(self) -> ModuleType: ... "terabytes", ] +TimeUnit: TypeAlias = Literal["ns", "us", "ms", "s"] + _ShapeT = TypeVar("_ShapeT", bound="tuple[int, ...]") _NDArray: TypeAlias = "np.ndarray[_ShapeT, Any]" _1DArray: TypeAlias = "_NDArray[tuple[int]]" # noqa: PYI042, PYI047 diff --git a/narwhals/utils.py b/narwhals/utils.py index c2ba24091..adb3d41f3 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -2,6 +2,7 @@ import os import re +from datetime import timezone from enum import Enum from enum import auto from secrets import token_hex @@ -45,6 +46,7 @@ from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame + from narwhals.dtypes import DType from narwhals.series import Series from narwhals.typing import CompliantDataFrame from narwhals.typing import CompliantExpr @@ -56,6 +58,7 @@ from narwhals.typing import IntoSeriesT from narwhals.typing import SizeUnit from narwhals.typing import SupportsNativeNamespace + from narwhals.typing import TimeUnit FrameOrSeriesT = TypeVar( "FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]] @@ -1189,6 +1192,43 @@ def check_column_names_are_unique(columns: list[str]) -> None: raise DuplicateError(msg) +def 
_parse_time_unit_and_time_zone( + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, +) -> tuple[set[str], set[str | None]]: + time_units = ( + {"ms", "us", "ns", "s"} + if time_unit is None + else {time_unit} + if isinstance(time_unit, str) + else set(time_unit) + ) + time_zones: set[str | None] = ( + {None} + if time_zone is None + else {str(time_zone)} + if isinstance(time_zone, (str, timezone)) + else {str(tz) if tz is not None else None for tz in time_zone} + ) + return time_units, time_zones + + +def dtype_matches_time_unit_and_time_zone( + dtype: DType, + dtypes: DTypes, + time_units: set[str], + time_zones: set[str | None], +) -> bool: + return ( + (dtype == dtypes.Datetime) + and (dtype.time_unit in time_units) # type: ignore[attr-defined] + and ( + dtype.time_zone in time_zones # type: ignore[attr-defined] + or ("*" in time_zones and dtype.time_zone is not None) # type: ignore[attr-defined] + ) + ) + + def is_compliant_dataframe(obj: Any) -> TypeIs[CompliantDataFrame]: return hasattr(obj, "__narwhals_dataframe__") diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 20a75d22b..c7703c5d4 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -2,17 +2,14 @@ import re from datetime import datetime +from datetime import timezone +from typing import Literal import pytest import narwhals.stable.v1 as nw -from narwhals.stable.v1.selectors import all -from narwhals.stable.v1.selectors import boolean -from narwhals.stable.v1.selectors import by_dtype -from narwhals.stable.v1.selectors import categorical -from narwhals.stable.v1.selectors import matches -from narwhals.stable.v1.selectors import numeric -from narwhals.stable.v1.selectors import string +import narwhals.stable.v1.selectors as ncs +from tests.utils import PANDAS_VERSION from tests.utils import POLARS_VERSION from tests.utils import PYARROW_VERSION from tests.utils import Constructor @@ -36,14 +33,14 @@ 
def test_selectors(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) - result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1) + result = df.select(ncs.by_dtype([nw.Int64, nw.Float64]) + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) def test_matches(constructor: Constructor) -> None: df = nw.from_native(constructor(data_regex)) - result = df.select(matches("[^z]a") + 1) + result = df.select(ncs.matches("[^z]a") + 1) expected = { "bar": [124, 457], "baz": [3.0, 6.5], @@ -53,21 +50,21 @@ def test_matches(constructor: Constructor) -> None: def test_numeric(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) - result = df.select(numeric() + 1) + result = df.select(ncs.numeric() + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) def test_boolean(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) - result = df.select(boolean()) + result = df.select(ncs.boolean()) expected = {"d": [True, False, True]} assert_equal_data(result, expected) def test_string(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) - result = df.select(string()) + result = df.select(ncs.string()) expected = {"b": ["a", "b", "c"]} assert_equal_data(result, expected) @@ -85,22 +82,137 @@ def test_categorical( expected = {"b": ["a", "b", "c"]} df = nw.from_native(constructor(data)).with_columns(nw.col("b").cast(nw.Categorical)) - result = df.select(categorical()) + result = df.select(ncs.categorical()) assert_equal_data(result, expected) +def test_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ( + "pyspark" in str(constructor) + or "duckdb" in str(constructor) + or "dask" in str(constructor) + or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) + or ("pyarrow" in str(constructor) and is_windows()) + or ("pandas" in str(constructor) and PANDAS_VERSION < (2,)) + ): + 
request.applymarker(pytest.mark.xfail) + + ts1 = datetime(2000, 11, 20, 18, 12, 16, 600000) + ts2 = datetime(2020, 10, 30, 10, 20, 25, 123000) + + data = { + "numeric": [3.14, 6.28], + "ts": [ts1, ts2], + } + time_units: list[Literal["ns", "us", "ms", "s"]] = ["ms", "us", "ns"] + + df = nw.from_native(constructor(data)).select( + nw.col("numeric"), + *[ + nw.col("ts").cast(nw.Datetime(time_unit=tu)).alias(f"ts_{tu}") + for tu in time_units + ], + *[ + nw.col("ts") + .dt.replace_time_zone("Europe/Lisbon") + .cast(nw.Datetime(time_zone="Europe/Lisbon", time_unit=tu)) + .alias(f"ts_lisbon_{tu}") + for tu in time_units + ], + *[ + nw.col("ts") + .dt.replace_time_zone("Europe/Berlin") + .cast(nw.Datetime(time_zone="Europe/Berlin", time_unit=tu)) + .alias(f"ts_berlin_{tu}") + for tu in time_units + ], + ) + + assert df.select(ncs.datetime()).collect_schema().names() == [ + "ts_ms", + "ts_us", + "ts_ns", + "ts_lisbon_ms", + "ts_lisbon_us", + "ts_lisbon_ns", + "ts_berlin_ms", + "ts_berlin_us", + "ts_berlin_ns", + ] + assert df.select(ncs.datetime(time_unit="ms")).collect_schema().names() == [ + "ts_ms", + "ts_lisbon_ms", + "ts_berlin_ms", + ] + assert df.select(ncs.datetime(time_unit=["us", "ns"])).collect_schema().names() == [ + "ts_us", + "ts_ns", + "ts_lisbon_us", + "ts_lisbon_ns", + "ts_berlin_us", + "ts_berlin_ns", + ] + + assert df.select(ncs.datetime(time_zone=None)).collect_schema().names() == [ + "ts_ms", + "ts_us", + "ts_ns", + ] + assert df.select(ncs.datetime(time_zone="*")).collect_schema().names() == [ + "ts_lisbon_ms", + "ts_lisbon_us", + "ts_lisbon_ns", + "ts_berlin_ms", + "ts_berlin_us", + "ts_berlin_ns", + ] + assert df.select( + ncs.datetime(time_zone=[None, "Europe/Berlin"]) + ).collect_schema().names() == [ + "ts_ms", + "ts_us", + "ts_ns", + "ts_berlin_ms", + "ts_berlin_us", + "ts_berlin_ns", + ] + + assert df.select( + ncs.datetime(time_unit="ns", time_zone=[None, "Europe/Berlin"]) + ).collect_schema().names() == ["ts_ns", "ts_berlin_ns"] + + assert 
df.with_columns( + nw.col("ts_ms").dt.replace_time_zone("UTC").alias("ts_utc") + ).select( + ncs.datetime(time_unit=["ms", "us"], time_zone=[None, timezone.utc]) + ).collect_schema().names() == ["ts_ms", "ts_us", "ts_utc"] + + +def test_datetime_no_tz(constructor: Constructor) -> None: + ts1 = datetime(2000, 11, 20, 18, 12, 16, 600000) + ts2 = datetime(2020, 10, 30, 10, 20, 25, 123000) + + data = { + "numeric": [3.14, 6.28], + "ts": [ts1, ts2], + } + + df = nw.from_native(constructor(data)) + assert df.select(ncs.datetime()).collect_schema().names() == ["ts"] + + @pytest.mark.parametrize( ("selector", "expected"), [ - (numeric() | boolean(), ["a", "c", "d"]), - (numeric() & boolean(), []), - (numeric() & by_dtype(nw.Int64), ["a"]), - (numeric() | by_dtype(nw.Int64), ["a", "c"]), - (~numeric(), ["b", "d"]), - (boolean() & True, ["d"]), - (boolean() | True, ["d"]), - (numeric() - 1, ["a", "c"]), - (all(), ["a", "b", "c", "d"]), + (ncs.numeric() | ncs.boolean(), ["a", "c", "d"]), + (ncs.numeric() & ncs.boolean(), []), + (ncs.numeric() & ncs.by_dtype(nw.Int64), ["a"]), + (ncs.numeric() | ncs.by_dtype(nw.Int64), ["a", "c"]), + (~ncs.numeric(), ["b", "d"]), + (ncs.boolean() & True, ["d"]), + (ncs.boolean() | True, ["d"]), + (ncs.numeric() - 1, ["a", "c"]), + (ncs.all(), ["a", "b", "c", "d"]), ], ) def test_set_ops( @@ -126,7 +238,7 @@ def test_subtract_expr( # subtracting it. 
request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select(numeric() - nw.col("a")) + result = df.select(ncs.numeric() - nw.col("a")) expected = {"a": [0, 0, 0], "c": [3.1, 4.0, 4.0]} assert_equal_data(result, expected) @@ -134,17 +246,17 @@ def test_subtract_expr( def test_set_ops_invalid(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 - numeric()) + df.select(1 - ncs.numeric()) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 | numeric()) + df.select(1 | ncs.numeric()) with pytest.raises((NotImplementedError, ValueError)): - df.select(1 & numeric()) + df.select(1 & ncs.numeric()) with pytest.raises( TypeError, match=re.escape("unsupported operand type(s) for op: ('Selector' + 'Selector')"), ): - df.select(boolean() + numeric()) + df.select(ncs.boolean() + ncs.numeric()) @pytest.mark.skipif(is_windows(), reason="windows is what it is")