From 1e0d4ae77942eb9f40bcd1139673ca5466898d20 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Wed, 6 Nov 2024 16:01:08 +0100 Subject: [PATCH 01/15] WIP --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 10 ++++ narwhals/_arrow/series.py | 19 +++++++ narwhals/_pandas_like/expr.py | 10 ++++ narwhals/_pandas_like/series.py | 14 +++++ narwhals/expr.py | 91 +++++++++++++++++++++++++++++++++ narwhals/series.py | 51 ++++++++++++++++++ 8 files changed, 197 insertions(+) diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 7188b2c36c..1ee8cae4f9 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -36,6 +36,7 @@ - over - pipe - quantile + - rank - round - sample - shift diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index e8572dda8d..0dda8107c1 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -43,6 +43,7 @@ - null_count - pipe - quantile + - rank - rename - round - sample diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 35e936d72f..52b1c6875b 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -372,6 +372,16 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: def mode(self: Self) -> Self: return reuse_series_implementation(self, "mode") + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 70009df43c..3610f1f41c 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -721,6 +721,25 @@ def mode(self: Self) -> ArrowSeries: plx.col(col_token) == plx.col(col_token).max() )[self.name] + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + sort_keys = "descending" if descending else "ascending" + tiebreaker = "first" if method == "ordinal" else method + native_series = self._native_series + null_mask = pc.is_null(native_series) + + rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + + result = pc.if_else(null_mask, pa.scalar(None), rank) + return self._from_native_series(result) + def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index a58597eea8..bc87a38337 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -387,6 +387,16 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: def mode(self: Self) -> Self: return reuse_series_implementation(self, "mode") + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def str(self: Self) -> PandasLikeExprStringNamespace: return PandasLikeExprStringNamespace(self) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 078e857b9e..057237b73c 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -682,6 +682,20 @@ def mode(self: Self) -> Self: def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + result = self._native_series.rank( + method="first" if method == "ordinal" else method, + na_option="keep", + ascending=not descending, + pct=False, + ) + return self._from_native_series(result) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/expr.py b/narwhals/expr.py index 2f986760c8..07e2fc92af 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2310,6 +2310,97 @@ def mode(self: Self) -> Self: """ return self.__class__(lambda plx: self._call(plx).mode()) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """ + Assign ranks to data, dealing with ties appropriately. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Examples + -------- + The 'average' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The 'ordinal' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use 'rank' with 'over' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + """ + + supported_rank_methods = {"average", "min", "max", "dense"} + if method not in supported_rank_methods: + msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'" + raise ValueError(msg) + + return self.__class__( + lambda plx: self._call(plx).rank(method=method, descending=descending) + ) + @property def str(self: Self) -> ExprStringNamespace[Self]: return ExprStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index dac5c6d795..868a9eb5c0 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2525,6 +2525,57 @@ def mode(self: Self) -> Self: def __iter__(self: Self) -> Iterator[Any]: yield from self._compliant_series.__iter__() + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """ + Assign ranks to data, dealing with ties appropriately. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Examples: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: 'a' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + """ + supported_rank_methods = {"average", "min", "max", "dense"} + if method not in supported_rank_methods: + msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'" + raise ValueError(msg) + + return self._from_compliant_series( + self._compliant_series.rank(method=method, descending=descending) + ) + @property def str(self: Self) -> SeriesStringNamespace[Self]: return SeriesStringNamespace(self) From ebf4321a0c4f98a76f02fde380575f95080ea639 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 7 Nov 2024 15:07:31 +0100 Subject: [PATCH 02/15] WIP --- narwhals/_arrow/series.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 3610f1f41c..48b90c118b 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -730,15 +730,18 @@ def rank( import pyarrow as pa # ignore-banned-import import pyarrow.compute as pc # ignore-banned-import - sort_keys = "descending" if descending else "ascending" - tiebreaker = "first" if method == "ordinal" else method - native_series = self._native_series - null_mask = pc.is_null(native_series) + if method != "average": + sort_keys = "descending" if descending else "ascending" + tiebreaker = "first" if method == "ordinal" else method + native_series = self._native_series + null_mask = pc.is_null(native_series) - rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) - result = pc.if_else(null_mask, pa.scalar(None), rank) - return self._from_native_series(result) + result = pc.if_else(null_mask, pa.scalar(None), rank) + return self._from_native_series(result) + else: + pass def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() From e60214d817c3afcfaf0e3f5f7d4266edd996eac3 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 8 Nov 2024 09:51:45 +0100 Subject: [PATCH 03/15] WIPWIP --- narwhals/_arrow/series.py | 25 +++++--- narwhals/expr.py | 98 ++++++++++++++---------------- narwhals/series.py | 64 ++++++++++++++----- tests/expr_and_series/rank_test.py | 98 ++++++++++++++++++++++++++++++ 4 files changed, 210 insertions(+), 75 deletions(-) create mode 100644 tests/expr_and_series/rank_test.py diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 48b90c118b..3278920a56 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -727,21 +727,26 @@ def rank( *, descending: bool, ) -> Self: + if method == "average": + msg = ( + "`rank` with `method='average' is not supported for pyarrow backend. " + "The available methods are {'min', 'max', 'dense', 'ordinal'}." + ) + raise ValueError(msg) + import pyarrow as pa # ignore-banned-import import pyarrow.compute as pc # ignore-banned-import - if method != "average": - sort_keys = "descending" if descending else "ascending" - tiebreaker = "first" if method == "ordinal" else method - native_series = self._native_series - null_mask = pc.is_null(native_series) + sort_keys = "descending" if descending else "ascending" + tiebreaker = "first" if method == "ordinal" else method - rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + native_series = self._native_series + null_mask = pc.is_null(native_series) - result = pc.if_else(null_mask, pa.scalar(None), rank) - return self._from_native_series(result) - else: - pass + rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + + result = pc.if_else(null_mask, pa.scalar(None), rank) + return self._from_native_series(result) def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() diff --git a/narwhals/expr.py b/narwhals/expr.py index 07e2fc92af..c0c83714f0 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2319,6 +2319,9 @@ def rank( """ Assign ranks to data, dealing with ties appropriately. + Notes: + The resulting dtype may differ between backends. + Arguments: method: The method used to assign ranks to tied elements. The following methods are available (default is 'average'): @@ -2338,61 +2341,54 @@ def rank( descending: Rank in descending order. - Examples - -------- - The 'average' method: + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = {"a": [3, 6, 1, 1, 6]} - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ + We define a dataframe-agnostic function that computes the dense rank for + the data: - The 'ordinal' method: + >>> @nw.narwhalify + ... def func(df): + ... return df.with_columns(rnk=nw.col("a").rank(method="dense")) - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ + We can then pass any supported library such as pandas, Polars, or PyArrow: + + >>> func(pl.DataFrame(data)) + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ rnk │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + │ 6 ┆ 3 │ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 6 ┆ 3 │ + └─────┴─────┘ + + >>> func(pd.DataFrame(data)) + a rnk + 0 3 2.0 + 1 6 3.0 + 2 1 1.0 + 3 1 1.0 + 4 6 3.0 + + >>> func(pa.table(data)) + pyarrow.Table + a: int64 + rnk: uint64 + ---- + a: [[3,6,1,1,6]] + rnk: [[2,3,1,1,3]] + """ - Use 'rank' with 'over' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - """ - - supported_rank_methods = {"average", "min", "max", "dense"} + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} if method not in supported_rank_methods: msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'" raise ValueError(msg) diff --git a/narwhals/series.py b/narwhals/series.py index 868a9eb5c0..7aa921eec9 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2534,6 +2534,9 @@ def rank( """ Assign ranks to data, dealing with ties appropriately. + Notes: + The resulting dtype may differ between backends. + Arguments: method: The method used to assign ranks to tied elements. The following methods are available (default is 'average'): @@ -2554,20 +2557,53 @@ def rank( descending: Rank in descending order. Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [3, 6, 1, 1, 6] + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> @nw.narwhalify + ... def func(s): + ... return s.rank(method="dense") - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: 'a' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - """ - supported_rank_methods = {"average", "min", "max", "dense"} + We can then pass any supported library such as pandas, Polars, or PyArrow: + + >>> func(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [u32] + [ + 2 + 3 + 1 + 1 + 3 + ] + + >>> func(pd.Series(data)) + 0 2.0 + 1 3.0 + 2 1.0 + 3 1.0 + 4 3.0 + dtype: float64 + + >>> func(pa.chunked_array([data])) # doctest:+ELLIPSIS + + [ + [ + 2, + 3, + 1, + 1, + 3 + ] + ] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} if method not in supported_rank_methods: msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'" raise ValueError(msg) @@ -3220,7 +3256,7 @@ def to_datetime(self: Self, format: str | None = None) -> T: # noqa: A002 ... def func(s): ... return s.str.to_datetime(format="%Y-%m-%d") - We can then pass any supported library such as pandas, Polars, or PyArrow:: + We can then pass any supported library such as pandas, Polars, or PyArrow: >>> func(s_pd) 0 2020-01-01 diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py new file mode 100644 index 0000000000..090605bf48 --- /dev/null +++ b/tests/expr_and_series/rank_test.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from contextlib import nullcontext as does_not_raise +from typing import Literal + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +rank_methods = ["average", "min", "max", "dense", "ordinal"] + +data = {"a": [3, 6, 1, 1, None, 6], "b": [1, 1, 2, 1, 2, 2]} + +expected = { + "average": [3.0, 4.5, 1.5, 1.5, float("nan"), 4.5], + "min": [3, 4, 1, 1, float("nan"), 4], + "max": [3, 5, 2, 2, float("nan"), 5], + "dense": [2, 3, 1, 1, float("nan"), 3], + "ordinal": [3, 4, 1, 2, float("nan"), 5], +} + +expected_over = { + "average": [2.0, 3.0, 1.0, 1.0, float("nan"), 2.0], + "min": [2, 3, 1, 1, float("nan"), 2], + "max": [2, 3, 1, 1, float("nan"), 2], + "dense": [2, 3, 1, 1, float("nan"), 2], + "ordinal": [2, 3, 1, 1, float("nan"), 2], +} + + +@pytest.mark.parametrize("method", rank_methods) +def test_rank_expr( + request: pytest.FixtureRequest, + constructor: Constructor, + method: Literal["average", "min", "max", "dense", "ordinal"], +) -> None: + if "dask" in str(constructor): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor(data)) + + result = df.select(nw.col("a").rank(method=method)) + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +def test_rank_series( + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], +) -> None: + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result = {"a": df["a"].rank(method=method)} + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +def test_rank_expr_in_over_context( + request: pytest.FixtureRequest, + constructor: Constructor, + method: Literal["average", "min", "max", "dense", "ordinal"], +) -> None: + if "pyarrow_table" in str(constructor) or "dask" in str(constructor): + # Pyarrow raises: + # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank + # We can handle that to provide a better error message. + request.applymarker(pytest.mark.xfail) + + df = nw.from_native(constructor(data)) + + result = df.select(nw.col("a").rank(method=method).over("b")) + expected_data = {"a": expected_over[method]} + assert_equal_data(result, expected_data) From cbc13b55b456d37e3294c74f67e20a9e3ff2697e Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 9 Nov 2024 19:46:06 +0100 Subject: [PATCH 04/15] pandas int workaround --- narwhals/_pandas_like/series.py | 50 +++++++++++++++++++++++++----- tests/expr_and_series/rank_test.py | 11 +++++-- 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 614b4a7951..97c7720c32 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -725,13 +725,49 @@ def rank( *, descending: bool, ) -> Self: - result = self._native_series.rank( - method="first" if method == "ordinal" else method, - na_option="keep", - ascending=not descending, - pct=False, - ) - return self._from_native_series(result) + pd_method = "first" if method == "ordinal" else method + native_series = self._native_series + + if ( + self._implementation is Implementation.PANDAS + and self._backend_version < (3,) + and self.dtype + in { + self._dtypes.Int64, + self._dtypes.Int32, + self._dtypes.Int16, + self._dtypes.Int8, + self._dtypes.UInt64, + self._dtypes.UInt32, + self._dtypes.UInt16, + self._dtypes.UInt8, + } + and (null_mask := native_series.isna()).any() + ): + # crazy workaround for the case of `na_option="keep"` and nullable + # integer dtypes. This should be supported in pandas > 3.0 + # https://github.com/pandas-dev/pandas/issues/56976 + ranked_series = ( + native_series.to_frame() + .assign(**{f"{native_series.name}_is_null": null_mask}) + .groupby(f"{native_series.name}_is_null") + .rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + )[native_series.name] + ) + + else: + ranked_series = native_series.rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + ) + + return self._from_native_series(ranked_series) @property def str(self) -> PandasLikeSeriesStringNamespace: diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index 090605bf48..695e4a37a5 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -12,7 +12,8 @@ rank_methods = ["average", "min", "max", "dense", "ordinal"] -data = {"a": [3, 6, 1, 1, None, 6], "b": [1, 1, 2, 1, 2, 2]} +data_int = {"a": [3, 6, 1, 1, None, 6], "b": [1, 1, 2, 1, 2, 2]} +data_float = {"a": [3.1, 6.1, 1.5, 1.5, None, 6.1], "b": [1, 1, 2, 1, 2, 2]} expected = { "average": [3.0, 4.5, 1.5, 1.5, float("nan"), 4.5], @@ -32,10 +33,12 @@ @pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) def test_rank_expr( request: pytest.FixtureRequest, constructor: Constructor, method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, float], ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) @@ -58,9 +61,11 @@ def test_rank_expr( @pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) def test_rank_series( constructor_eager: ConstructorEager, method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, float], ) -> None: context = ( pytest.raises( @@ -85,13 +90,13 @@ def test_rank_expr_in_over_context( constructor: Constructor, method: Literal["average", "min", "max", "dense", "ordinal"], ) -> None: - if "pyarrow_table" in str(constructor) or "dask" in str(constructor): + if "polars" not in str(constructor): # Pyarrow raises: # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank # We can handle that to provide a better error message. request.applymarker(pytest.mark.xfail) - df = nw.from_native(constructor(data)) + df = nw.from_native(constructor(data_int)) result = df.select(nw.col("a").rank(method=method).over("b")) expected_data = {"a": expected_over[method]} From 8b492d59b761d1a19d2f34fb5afbf16fdccface8 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sat, 9 Nov 2024 20:51:37 +0100 Subject: [PATCH 05/15] comma? --- narwhals/expr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index bd38d63942..823f985fe0 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1003,7 +1003,8 @@ def replace_strict( ... def func(df): ... return df.with_columns( ... b=nw.col("a").replace_strict( - ... [0,1,2,3], ['zero', 'one', 'two', 'three'] + ... [0, 1, 2, 3], + ... ["zero", "one", "two", "three"], ... return_dtype=nw.String, ... ) ... ) From 4c8cc1b9090fc315126a90b292b6ed17ac51deb1 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 10 Nov 2024 11:07:55 +0100 Subject: [PATCH 06/15] merge main, test invalid method --- narwhals/expr.py | 5 ++++- narwhals/series.py | 5 ++++- tests/expr_and_series/rank_test.py | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 089e56fcfe..8c6fc8b276 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -2528,7 +2528,10 @@ def rank( supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} if method not in supported_rank_methods: - msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'" + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) raise ValueError(msg) return self.__class__( diff --git a/narwhals/series.py b/narwhals/series.py index b49e2d0931..5f5a27c9c6 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2731,7 +2731,10 @@ def rank( """ supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} if method not in supported_rank_methods: - msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'" + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) raise ValueError(msg) return self._from_compliant_series( diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index 695e4a37a5..259bfbd5dc 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -101,3 +101,19 @@ def test_rank_expr_in_over_context( result = df.select(nw.col("a").rank(method=method).over("b")) expected_data = {"a": expected_over[method]} assert_equal_data(result, expected_data) + + +def test_invalid_method_raise(constructor: Constructor) -> None: + method = "invalid_method_name" + df = nw.from_native(constructor(data_float)) + + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + + with pytest.raises(ValueError, match=msg): + df.select(nw.col("a").rank(method=method)) # type: ignore[arg-type] + + with pytest.raises(ValueError, match=msg): + df.lazy().collect()["a"].rank(method=method) # type: ignore[arg-type] From ec0f8a7ccf72bdb1b63b112e545b9c1fe0dc5a50 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 10 Nov 2024 11:20:37 +0100 Subject: [PATCH 07/15] old pyarrow --- narwhals/_arrow/series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index fc095715b6..dcbfa855ed 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -777,6 +777,9 @@ def rank( tiebreaker = "first" if method == "ordinal" else method native_series = self._native_series + if self._backend_version < (14, 0, 0): # pragma: no cover + native_series = native_series.combine_chunks() + null_mask = pc.is_null(native_series) rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) From e8989e35bcd5a7358ed12914b3cc5455ee365893 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Wed, 18 Dec 2024 11:21:37 +0100 Subject: [PATCH 08/15] WIP --- narwhals/_pandas_like/expr.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 5c1ab22024..eb9e4e56bd 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -33,6 +33,7 @@ # Pandas cumcount counts nulls while Polars does not # So, instead of using "cumcount" we use "cumsum" on notna() to get the same result "col->cum_count": "cumsum", + "col->rank": "rank", } @@ -411,7 +412,7 @@ def alias(self, name: str) -> Self: version=self._version, ) - def over(self, keys: list[str]) -> Self: + def over(self: Self, keys: list[str]) -> Self: if self._function_name in CUMULATIVE_FUNCTIONS_TO_PANDAS_EQUIVALENT: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: @@ -430,11 +431,12 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: plx = self.__narwhals_namespace__() df = df.with_columns(~plx.col(*self._root_names).is_null()) - res_native = df._native_frame.groupby(list(keys), as_index=False)[ - self._root_names - ].transform( - CUMULATIVE_FUNCTIONS_TO_PANDAS_EQUIVALENT[self._function_name] - ) + res_native = getattr( + df._native_frame.groupby(list(keys), as_index=False)[ + self._root_names + ], + CUMULATIVE_FUNCTIONS_TO_PANDAS_EQUIVALENT[self._function_name], + )() result_frame = df._from_native_frame( rename( res_native, From 07571c89bc5f6dc43619567d00810c2cd4c6eaf5 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 26 Dec 2024 20:22:16 +0100 Subject: [PATCH 09/15] fail pandas_pyarrow for pandas < (2,1) --- tests/expr_and_series/rank_test.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index 9d3a60a162..e64d58574e 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -6,6 +6,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -40,7 +41,9 @@ def test_rank_expr( method: Literal["average", "min", "max", "dense", "ordinal"], data: dict[str, float], ) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or ( + "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1) + ): request.applymarker(pytest.mark.xfail) context = ( @@ -63,10 +66,14 @@ def test_rank_expr( @pytest.mark.parametrize("method", rank_methods) @pytest.mark.parametrize("data", [data_int, data_float]) def test_rank_series( + request: pytest.FixtureRequest, constructor_eager: ConstructorEager, method: Literal["average", "min", "max", "dense", "ordinal"], data: dict[str, float], ) -> None: + if "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1): + request.applymarker(pytest.mark.xfail) + context = ( pytest.raises( ValueError, @@ -96,10 +103,10 @@ def test_rank_expr_in_over_context( # We can handle that to provide a better error message. request.applymarker(pytest.mark.xfail) + if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): + request.applymarker(pytest.mark.xfail) + if method == "ordinal" and "polars" not in str(constructor): - # Pyarrow raises: - # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank - # We can handle that to provide a better error message. request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_int)) From 96520ae6061c9b22e762f2d2a7b4d6f1031cfc9f Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 27 Dec 2024 08:54:36 +0100 Subject: [PATCH 10/15] xfail int only --- tests/expr_and_series/rank_test.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index e64d58574e..db589e7e36 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -39,10 +39,15 @@ def test_rank_expr( request: pytest.FixtureRequest, constructor: Constructor, method: Literal["average", "min", "max", "dense", "ordinal"], - data: dict[str, float], + data: dict[str, list[float]], ) -> None: - if "dask" in str(constructor) or ( - "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1) + if "dask" in str(constructor): + request.applymarker(pytest.mark.xfail) + + if ( + "pandas_pyarrow" in str(constructor) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) ): request.applymarker(pytest.mark.xfail) @@ -69,9 +74,13 @@ def test_rank_series( request: pytest.FixtureRequest, constructor_eager: ConstructorEager, method: Literal["average", "min", "max", "dense", "ordinal"], - data: dict[str, float], + data: dict[str, list[float]], ) -> None: - if "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1): + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): request.applymarker(pytest.mark.xfail) context = ( From 6ad961e5efae8b96d944eeef8030a184dd243ee6 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 27 Dec 2024 08:58:03 +0100 Subject: [PATCH 11/15] fix options in over --- tests/expr_and_series/rank_test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index db589e7e36..16f95a82ba 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -115,10 +115,7 @@ def test_rank_expr_in_over_context( if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) - if method == "ordinal" and "polars" not in str(constructor): - request.applymarker(pytest.mark.xfail) - - df = nw.from_native(constructor(data_int)) + df = nw.from_native(constructor(data_float)) result = df.select(nw.col("a").rank(method=method).over("b")) expected_data = {"a": expected_over[method]} From 5c565a4fe1cb4238ebd5cfeec47a153392dfce76 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Fri, 27 Dec 2024 09:00:41 +0100 Subject: [PATCH 12/15] forgot a file --- narwhals/_pandas_like/expr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 298e152d4c..e76c1940c2 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -448,9 +448,12 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: if self._function_name == "col->shift": kwargs = {"periods": self._kwargs.get("n", 1)} elif self._function_name == "col->rank": + _method = self._kwargs.get("method", "average") kwargs = { - "method": self._kwargs.get("method", "average"), + "method": "first" if _method == "ordinal" else _method, "ascending": not self._kwargs.get("descending", False), + "na_option": "keep", + "pct": False, } else: # Cumulative operation kwargs = {"skipna": True} From 1b550e42454e0c216ac0e2643dc1661b14725e47 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 30 Dec 2024 16:29:39 +0100 Subject: [PATCH 13/15] merge main and better return docstring --- narwhals/expr.py | 2 +- narwhals/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index ea8e326a60..e8794c987c 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3826,7 +3826,7 @@ def rank( descending: Rank in descending order. Returns: - A new expression. + A new expression with rank data. Examples: >>> import narwhals as nw diff --git a/narwhals/series.py b/narwhals/series.py index 8db9308c56..db7c0120f8 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3977,7 +3977,7 @@ def rank( descending: Rank in descending order. Returns: - A new series + A new series with rank data as values. Examples: >>> import narwhals as nw From b68f575d0884a659859a6dbaae7805dd913d7103 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 2 Jan 2025 14:47:59 +0100 Subject: [PATCH 14/15] float(nan) -> None --- tests/expr_and_series/rank_test.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index 16f95a82ba..b3de335d77 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -17,19 +17,19 @@ data_float = {"a": [3.1, 6.1, 1.5, 1.5, None, 6.1], "b": [1, 1, 2, 1, 2, 2]} expected = { - "average": [3.0, 4.5, 1.5, 1.5, float("nan"), 4.5], - "min": [3, 4, 1, 1, float("nan"), 4], - "max": [3, 5, 2, 2, float("nan"), 5], - "dense": [2, 3, 1, 1, float("nan"), 3], - "ordinal": [3, 4, 1, 2, float("nan"), 5], + "average": [3.0, 4.5, 1.5, 1.5, None, 4.5], + "min": [3, 4, 1, 1, None, 4], + "max": [3, 5, 2, 2, None, 5], + "dense": [2, 3, 1, 1, None, 3], + "ordinal": [3, 4, 1, 2, None, 5], } expected_over = { - "average": [2.0, 3.0, 1.0, 1.0, float("nan"), 2.0], - "min": [2, 3, 1, 1, float("nan"), 2], - "max": [2, 3, 1, 1, float("nan"), 2], - "dense": [2, 3, 1, 1, float("nan"), 2], - "ordinal": [2, 3, 1, 1, float("nan"), 2], + "average": [2.0, 3.0, 1.0, 1.0, None, 2.0], + "min": [2, 3, 1, 1, None, 2], + "max": [2, 3, 1, 1, None, 2], + "dense": [2, 3, 1, 1, None, 2], + "ordinal": [2, 3, 1, 1, None, 2], } From 6c72df7e423b33ac4f47e7d507a476c80f322bec Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 7 Jan 2025 10:10:39 +0100 Subject: [PATCH 15/15] test eager only for rank --- tests/expr_and_series/rank_test.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py index b3de335d77..99a64371e6 100644 --- a/tests/expr_and_series/rank_test.py +++ b/tests/expr_and_series/rank_test.py @@ -7,7 +7,6 @@ import narwhals.stable.v1 as nw from tests.utils import PANDAS_VERSION -from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -37,15 +36,12 @@ @pytest.mark.parametrize("data", [data_int, data_float]) def test_rank_expr( request: pytest.FixtureRequest, - constructor: Constructor, + constructor_eager: ConstructorEager, method: Literal["average", "min", "max", "dense", "ordinal"], data: dict[str, list[float]], ) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) - if ( - "pandas_pyarrow" in str(constructor) + "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1) and isinstance(data["a"][0], int) ): @@ -56,12 +52,12 @@ def test_rank_expr( ValueError, match=r"`rank` with `method='average' is not supported for pyarrow backend.", ) - if "pyarrow_table" in str(constructor) and method == "average" + if "pyarrow_table" in str(constructor_eager) and method == "average" else does_not_raise() ) with context: - df = nw.from_native(constructor(data)) + df = nw.from_native(constructor_eager(data)) result = df.select(nw.col("a").rank(method=method)) expected_data = {"a": expected[method]} @@ -103,28 +99,28 @@ def test_rank_series( @pytest.mark.parametrize("method", rank_methods) def test_rank_expr_in_over_context( request: pytest.FixtureRequest, - constructor: Constructor, + constructor_eager: ConstructorEager, method: Literal["average", "min", "max", "dense", "ordinal"], ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask")): + if any(x in str(constructor_eager) for x in ("pyarrow_table", "dask")): # Pyarrow raises: # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank # We can handle that to provide a better error message. request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): + if "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) - df = nw.from_native(constructor(data_float)) + df = nw.from_native(constructor_eager(data_float)) result = df.select(nw.col("a").rank(method=method).over("b")) expected_data = {"a": expected_over[method]} assert_equal_data(result, expected_data) -def test_invalid_method_raise(constructor: Constructor) -> None: +def test_invalid_method_raise(constructor_eager: ConstructorEager) -> None: method = "invalid_method_name" - df = nw.from_native(constructor(data_float)) + df = nw.from_native(constructor_eager(data_float)) msg = ( "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. "