From 8164c5a3170156132c43954b3c1643a5be2314df Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Sun, 17 Sep 2023 13:15:50 +0100 Subject: [PATCH 1/5] Implement `GroupBy.value_counts` to match pandas --- python/cudf/cudf/core/groupby/groupby.py | 143 +++++++++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 45 +++++++ 2 files changed, 188 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b300c55b537..6a6ad8cf57f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2336,6 +2336,149 @@ def pct_change( shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrameOrSeries: + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will + have an additional column with the value_counts. The column is + labelled 'count' or 'proportion', depending on the ``normalize`` + parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + + df = cudf.DataFrame.copy(self.obj) + groupings = self.grouping.names + + if subset is None: + subset = [i for i in df.columns if i not in groupings] + + df["placeholder"] = 1 + result = ( + df.groupby(groupings + subset, dropna=dropna) + .placeholder.count() + .sort_index() + .astype(np.int64) + ) + + if normalize: + levels = list(range(len(groupings), result.index.nlevels)) + result /= result.groupby( + result.index.droplevel(levels), + ).transform("sum") + + if sort: + result = result.sort_values(ascending=ascending).sort_index( + level=range(len(groupings)), sort_remaining=False + ) + + result.name = "proportion" if normalize else "count" + if not self._as_index: + result = result.to_frame().reset_index() + + return result + def _mimic_pandas_order( self, result: DataFrameOrSeries ) -> DataFrameOrSeries: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 042f0e1aa38..9311d3d04c4 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3473,3 +3473,48 @@ def test_categorical_grouping_pandas_compatibility(): expected = pdf.groupby("key", sort=False).sum() assert_eq(actual, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", np.nan, "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = pdf.groupby("gender", as_index=as_index).value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_eq(actual, expected, check_names=False) + + +def test_group_by_value_counts_subset(): + # From Issue#12789 + df = cudf.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + pdf = df.to_pandas() + + actual = df.groupby("gender").value_counts(["education"]) + expected = pdf.groupby("gender").value_counts(["education"]) + + # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` + assert_eq(actual, expected, check_names=False) From 7c912835d371edf09306dfb3aa5fa4741e0e528c Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Mon, 18 Sep 2023 10:56:16 +0000 Subject: [PATCH 2/5] Update tests for `GroupBy.value_counts` --- python/cudf/cudf/tests/test_groupby.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 9311d3d04c4..2d35c545bbe 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3499,7 +3499,9 @@ def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): ) # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` - assert_eq(actual, expected, check_names=False) + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) def test_group_by_value_counts_subset(): @@ -3517,4 +3519,6 @@ def test_group_by_value_counts_subset(): expected = pdf.groupby("gender").value_counts(["education"]) # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` - assert_eq(actual, expected, check_names=False) + assert_groupby_results_equal( + actual, expected, check_names=False, check_index_type=False + ) From 485ad868a29feb44346a8a030ea6dcd0cc95dc40 Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:53:18 +0100 Subject: [PATCH 3/5] Cover possible errors with `GroupBy.value_counts` --- python/cudf/cudf/core/groupby/groupby.py | 29 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6a6ad8cf57f..821bfceb5c4 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2450,14 +2450,29 @@ def value_counts( df = cudf.DataFrame.copy(self.obj) groupings = self.grouping.names + name = "proportion" if normalize else "count" if subset is None: subset = [i for i in df.columns if i not in groupings] + # Check subset exists in dataframe + elif set(subset) - set(df.columns): + raise ValueError( + f"Keys {set(subset) - set(df.columns)} in subset " + f"do not exist in the DataFrame." + ) + # Catch case where groupby and subset share an element + elif set(subset) & set(groupings): + raise ValueError( + f"Keys {set(subset) & set(groupings)} in subset " + "cannot be in the groupby column keys." + ) - df["placeholder"] = 1 + df["__placeholder"] = 1 result = ( - df.groupby(groupings + subset, dropna=dropna) - .placeholder.count() + df.groupby(groupings + list(subset), dropna=dropna)[ + "__placeholder" + ] + .count() .sort_index() .astype(np.int64) ) @@ -2473,9 +2488,15 @@ def value_counts( level=range(len(groupings)), sort_remaining=False ) - result.name = "proportion" if normalize else "count" if not self._as_index: + if name in df.columns: + raise ValueError( + f"Column label '{name}' is duplicate of result column" + ) + result.name = name result = result.to_frame().reset_index() + else: + result.name = name return result From 42c345745aed5d81d8df91bfaf50ffc496d26030 Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:22:01 +0100 Subject: [PATCH 4/5] Add tests for error cases --- python/cudf/cudf/tests/test_groupby.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 2d35c545bbe..376639d5226 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3522,3 +3522,21 @@ def test_group_by_value_counts_subset(): assert_groupby_results_equal( actual, expected, check_names=False, check_index_type=False ) + + +def test_group_by_value_counts_clash_with_subset(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["a"]) + + +def test_group_by_value_counts_subset_not_exists(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["c"]) + + +def test_group_by_value_counts_with_count_column(): + df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a", as_index=False).value_counts() From b01b566eae0bf40b1c0d44afb2a70735fb8a9475 Mon Sep 17 00:00:00 2001 From: Sam Turner <98767222+stmio@users.noreply.github.com> Date: Tue, 19 Sep 2023 06:44:06 +0100 Subject: [PATCH 5/5] Use `._column_names` rather than `.columns` --- python/cudf/cudf/core/groupby/groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 821bfceb5c4..e1740140b44 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2453,11 +2453,11 @@ def value_counts( name = "proportion" if normalize else "count" if subset is None: - subset = [i for i in df.columns if i not in groupings] + subset = [i for i in df._column_names if i not in groupings] # Check subset exists in dataframe - elif set(subset) - set(df.columns): + elif set(subset) - set(df._column_names): raise ValueError( - f"Keys {set(subset) - set(df.columns)} in subset " + f"Keys {set(subset) - set(df._column_names)} in subset " f"do not exist in the DataFrame." ) # Catch case where groupby and subset share an element @@ -2489,7 +2489,7 @@ def value_counts( ) if not self._as_index: - if name in df.columns: + if name in df._column_names: raise ValueError( f"Column label '{name}' is duplicate of result column" )