From 8164c5a3170156132c43954b3c1643a5be2314df Mon Sep 17 00:00:00 2001
From: Sam Turner <98767222+stmio@users.noreply.github.com>
Date: Sun, 17 Sep 2023 13:15:50 +0100
Subject: [PATCH 1/5] Implement `GroupBy.value_counts` to match pandas

---
 python/cudf/cudf/core/groupby/groupby.py | 143 +++++++++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   |  45 +++++++
 2 files changed, 188 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b300c55b537..6a6ad8cf57f 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2336,6 +2336,149 @@ def pct_change(
         shifted = fill_grp.shift(periods=periods, freq=freq)
         return (filled / shifted) - 1
 
+    def value_counts(
+        self,
+        subset=None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> DataFrameOrSeries:
+        """
+        Return a Series or DataFrame containing counts of unique rows.
+
+        Parameters
+        ----------
+        subset : list-like, optional
+            Columns to use when counting unique combinations.
+        normalize : bool, default False
+            Return proportions rather than frequencies.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order.
+        dropna : bool, default True
+            Don't include counts of rows that contain NA values.
+
+        Returns
+        -------
+        Series or DataFrame
+            Series if the groupby as_index is True, otherwise DataFrame.
+
+        See Also
+        --------
+        Series.value_counts: Equivalent method on Series.
+        DataFrame.value_counts: Equivalent method on DataFrame.
+        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.
+
+        Notes
+        -----
+        - If the groupby as_index is True then the returned Series will have a
+          MultiIndex with one level per input column.
+        - If the groupby as_index is False then the returned DataFrame will
+          have an additional column with the value_counts. The column is
+          labelled 'count' or 'proportion', depending on the ``normalize``
+          parameter.
+
+        By default, rows that contain any NA values are omitted from
+        the result.
+
+        By default, the result will be in descending order so that the
+        first element of each group is the most frequently-occurring row.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({
+        ...    'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
+        ...    'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
+        ...    'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
+        ... })
+
+        >>> df
+           gender education country
+        0    male       low      US
+        1    male    medium      FR
+        2  female      high      US
+        3    male       low      FR
+        4  female      high      FR
+        5    male       low      FR
+
+        >>> df.groupby('gender').value_counts()
+        gender  education  country
+        female  high       FR         1
+                           US         1
+        male    low        FR         2
+                           US         1
+                medium     FR         1
+        Name: count, dtype: int64
+
+        >>> df.groupby('gender').value_counts(ascending=True)
+        gender  education  country
+        female  high       FR         1
+                           US         1
+        male    low        US         1
+                medium     FR         1
+                low        FR         2
+        Name: count, dtype: int64
+
+        >>> df.groupby('gender').value_counts(normalize=True)
+        gender  education  country
+        female  high       FR         0.50
+                           US         0.50
+        male    low        FR         0.50
+                           US         0.25
+                medium     FR         0.25
+        Name: proportion, dtype: float64
+
+        >>> df.groupby('gender', as_index=False).value_counts()
+           gender education country  count
+        0  female      high      FR      1
+        1  female      high      US      1
+        2    male       low      FR      2
+        3    male       low      US      1
+        4    male    medium      FR      1
+
+        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
+           gender education country  proportion
+        0  female      high      FR        0.50
+        1  female      high      US        0.50
+        2    male       low      FR        0.50
+        3    male       low      US        0.25
+        4    male    medium      FR        0.25
+        """
+
+        df = cudf.DataFrame.copy(self.obj)
+        groupings = self.grouping.names
+
+        if subset is None:
+            subset = [i for i in df.columns if i not in groupings]
+
+        df["placeholder"] = 1
+        result = (
+            df.groupby(groupings + subset, dropna=dropna)
+            .placeholder.count()
+            .sort_index()
+            .astype(np.int64)
+        )
+
+        if normalize:
+            levels = list(range(len(groupings), result.index.nlevels))
+            result /= result.groupby(
+                result.index.droplevel(levels),
+            ).transform("sum")
+
+        if sort:
+            result = result.sort_values(ascending=ascending).sort_index(
+                level=range(len(groupings)), sort_remaining=False
+            )
+
+        result.name = "proportion" if normalize else "count"
+        if not self._as_index:
+            result = result.to_frame().reset_index()
+
+        return result
+
     def _mimic_pandas_order(
         self, result: DataFrameOrSeries
     ) -> DataFrameOrSeries:
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 042f0e1aa38..9311d3d04c4 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3473,3 +3473,48 @@ def test_categorical_grouping_pandas_compatibility():
     expected = pdf.groupby("key", sort=False).sum()
 
     assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("as_index", [True, False])
+def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index):
+    # From Issue#12789
+    df = cudf.DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", np.nan, "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    pdf = df.to_pandas()
+
+    actual = df.groupby("gender", as_index=as_index).value_counts(
+        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+    )
+    expected = pdf.groupby("gender", as_index=as_index).value_counts(
+        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+    )
+
+    # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
+    assert_eq(actual, expected, check_names=False)
+
+
+def test_group_by_value_counts_subset():
+    # From Issue#12789
+    df = cudf.DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", "high", "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    pdf = df.to_pandas()
+
+    actual = df.groupby("gender").value_counts(["education"])
+    expected = pdf.groupby("gender").value_counts(["education"])
+
+    # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
+    assert_eq(actual, expected, check_names=False)

From 7c912835d371edf09306dfb3aa5fa4741e0e528c Mon Sep 17 00:00:00 2001
From: Sam Turner <98767222+stmio@users.noreply.github.com>
Date: Mon, 18 Sep 2023 10:56:16 +0000
Subject: [PATCH 2/5] Update tests for `GroupBy.value_counts`

---
 python/cudf/cudf/tests/test_groupby.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 9311d3d04c4..2d35c545bbe 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3499,7 +3499,9 @@ def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index):
     )
 
     # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
-    assert_eq(actual, expected, check_names=False)
+    assert_groupby_results_equal(
+        actual, expected, check_names=False, check_index_type=False
+    )
 
 
 def test_group_by_value_counts_subset():
@@ -3517,4 +3519,6 @@ def test_group_by_value_counts_subset():
     expected = pdf.groupby("gender").value_counts(["education"])
 
     # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
-    assert_eq(actual, expected, check_names=False)
+    assert_groupby_results_equal(
+        actual, expected, check_names=False, check_index_type=False
+    )

From 485ad868a29feb44346a8a030ea6dcd0cc95dc40 Mon Sep 17 00:00:00 2001
From: Sam Turner <98767222+stmio@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:53:18 +0100
Subject: [PATCH 3/5] Cover possible errors with `GroupBy.value_counts`

---
 python/cudf/cudf/core/groupby/groupby.py | 29 ++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 6a6ad8cf57f..821bfceb5c4 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2450,14 +2450,29 @@ def value_counts(
 
         df = cudf.DataFrame.copy(self.obj)
         groupings = self.grouping.names
+        name = "proportion" if normalize else "count"
 
         if subset is None:
             subset = [i for i in df.columns if i not in groupings]
+        # Check subset exists in dataframe
+        elif set(subset) - set(df.columns):
+            raise ValueError(
+                f"Keys {set(subset) - set(df.columns)} in subset "
+                f"do not exist in the DataFrame."
+            )
+        # Catch case where groupby and subset share an element
+        elif set(subset) & set(groupings):
+            raise ValueError(
+                f"Keys {set(subset) & set(groupings)} in subset "
+                "cannot be in the groupby column keys."
+            )
 
-        df["placeholder"] = 1
+        df["__placeholder"] = 1
         result = (
-            df.groupby(groupings + subset, dropna=dropna)
-            .placeholder.count()
+            df.groupby(groupings + list(subset), dropna=dropna)[
+                "__placeholder"
+            ]
+            .count()
             .sort_index()
             .astype(np.int64)
         )
@@ -2473,9 +2488,15 @@ def value_counts(
                 level=range(len(groupings)), sort_remaining=False
             )
 
-        result.name = "proportion" if normalize else "count"
         if not self._as_index:
+            if name in df.columns:
+                raise ValueError(
+                    f"Column label '{name}' is duplicate of result column"
+                )
+            result.name = name
             result = result.to_frame().reset_index()
+        else:
+            result.name = name
 
         return result
 

From 42c345745aed5d81d8df91bfaf50ffc496d26030 Mon Sep 17 00:00:00 2001
From: Sam Turner <98767222+stmio@users.noreply.github.com>
Date: Mon, 18 Sep 2023 20:22:01 +0100
Subject: [PATCH 4/5] Add tests for error cases

---
 python/cudf/cudf/tests/test_groupby.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 2d35c545bbe..376639d5226 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3522,3 +3522,21 @@ def test_group_by_value_counts_subset():
     assert_groupby_results_equal(
         actual, expected, check_names=False, check_index_type=False
     )
+
+
+def test_group_by_value_counts_clash_with_subset():
+    df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]})
+    with pytest.raises(ValueError, match="groupby column keys"):
+        df.groupby("a").value_counts(["a"])
+
+
+def test_group_by_value_counts_subset_not_exists():
+    df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]})
+    with pytest.raises(ValueError, match="do not exist in the DataFrame"):
+        df.groupby("a").value_counts(["c"])
+
+
+def test_group_by_value_counts_with_count_column():
+    df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]})
+    with pytest.raises(ValueError, match="is duplicate of result column"):
+        df.groupby("a", as_index=False).value_counts()

From b01b566eae0bf40b1c0d44afb2a70735fb8a9475 Mon Sep 17 00:00:00 2001
From: Sam Turner <98767222+stmio@users.noreply.github.com>
Date: Tue, 19 Sep 2023 06:44:06 +0100
Subject: [PATCH 5/5] Use `._column_names` rather than `.columns`

---
 python/cudf/cudf/core/groupby/groupby.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 821bfceb5c4..e1740140b44 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2453,11 +2453,11 @@ def value_counts(
         name = "proportion" if normalize else "count"
 
         if subset is None:
-            subset = [i for i in df.columns if i not in groupings]
+            subset = [i for i in df._column_names if i not in groupings]
         # Check subset exists in dataframe
-        elif set(subset) - set(df.columns):
+        elif set(subset) - set(df._column_names):
             raise ValueError(
-                f"Keys {set(subset) - set(df.columns)} in subset "
+                f"Keys {set(subset) - set(df._column_names)} in subset "
                 f"do not exist in the DataFrame."
             )
         # Catch case where groupby and subset share an element
@@ -2489,7 +2489,7 @@ def value_counts(
             )
 
         if not self._as_index:
-            if name in df.columns:
+            if name in df._column_names:
                 raise ValueError(
                     f"Column label '{name}' is duplicate of result column"
                 )