[ENH] `row_to_names` for polars (#1363)

Added `row_to_names` for polars DataFrames.
pyjanitor-devs · Jun 11, 2024 · 5c281b2 · 5c281b2
1 parent 2dff4f6
commit 5c281b2
Show file tree

Hide file tree

Showing 12 changed files with 1,236 additions and 748 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # Changelog
 
 ## [Unreleased]
+-  [ENH] Added a `row_to_names` method for polars. Issue #1352
 -  [ENH] `read_commandline` function now supports polars - Issue #1352
 
 - [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341

diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py
@@ -1,5 +1,7 @@
 """Implementation of the `row_to_names` function."""
 
+from __future__ import annotations
+
 import warnings
 
 import numpy as np
@@ -13,7 +15,7 @@
 @deprecated_alias(row_number="row_numbers", remove_row="remove_rows")
 def row_to_names(
     df: pd.DataFrame,
-    row_numbers: int = 0,
+    row_numbers: int | list = 0,
     remove_rows: bool = False,
     remove_rows_above: bool = False,
     reset_index: bool = False,
@@ -73,7 +75,7 @@ def row_to_names(
             Note that indexing starts from 0. It can also be a list,
             in which case, a MultiIndex column is created.
             Defaults to 0 (first row).
-        remove_row: Whether the row(s) should be removed from the DataFrame.
+        remove_rows: Whether the row(s) should be removed from the DataFrame.
         remove_rows_above: Whether the row(s) above the selected row should
             be removed from the DataFrame.
         reset_index: Whether the index should be reset on the returning DataFrame.
@@ -84,10 +86,10 @@ def row_to_names(
     if not pd.options.mode.copy_on_write:
         df = df.copy()
 
-    check("row_number", row_numbers, [int, list])
+    check("row_numbers", row_numbers, [int, list])
     if isinstance(row_numbers, list):
         for entry in row_numbers:
-            check("entry in the row_number argument", entry, [int])
+            check("entry in the row_numbers argument", entry, [int])
 
     warnings.warn(
         "The function row_to_names will, in the official 1.0 release, "

diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py
diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py
@@ -115,11 +115,11 @@ def _strip_underscores_func_expr(
 
 def _clean_column_names(
     obj: str,
-    strip_underscores: str | bool = None,
-    case_type: str = "lower",
-    remove_special: bool = False,
-    strip_accents: bool = False,
-    truncate_limit: int = None,
+    strip_underscores: str | bool,
+    case_type: str,
+    remove_special: bool,
+    strip_accents: bool,
+    truncate_limit: int,
 ) -> str:
     """
     Function to clean the column names of a polars DataFrame.

diff --git a/janitor/polars/dataframe.py b/janitor/polars/dataframe.py
diff --git a/janitor/polars/expressions.py b/janitor/polars/expressions.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+from janitor.utils import import_message
+
+from .clean_names import _clean_expr_names
+
+try:
+    import polars as pl
+except ImportError:
+    import_message(
+        submodule="polars",
+        package="polars",
+        conda_channel="conda-forge",
+        pip_install=True,
+    )
+
+
+# Registers PolarsExpr under the "janitor" expression namespace, so its
+# methods are invoked as `pl.col(...).janitor.<method>(...)`.
+@pl.api.register_expr_namespace("janitor")
+class PolarsExpr:
+    # NOTE(review): __init__ only stores `expr` and implicitly returns None;
+    # the `-> pl.Expr` return annotation looks incorrect - confirm and fix.
+    def __init__(self, expr: pl.Expr) -> pl.Expr:
+        # Keep the wrapped expression for the namespace methods to operate on.
+        self._expr = expr
+
+    # NOTE(review): `strip_underscores` and `truncate_limit` default to None
+    # although their annotations do not include None - presumably
+    # `str | bool | None` and `int | None` are intended; confirm.
+    def clean_names(
+        self,
+        strip_underscores: str | bool = None,
+        case_type: str = "lower",
+        remove_special: bool = False,
+        strip_accents: bool = False,
+        enforce_string: bool = False,
+        truncate_limit: int = None,
+    ) -> pl.Expr:
+        """
+        Clean the labels in a polars Expression.
+
+        Examples:
+            >>> import polars as pl
+            >>> import janitor.polars
+            >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
+            >>> df
+            shape: (1, 1)
+            ┌─────────────┐
+            │ raw         │
+            │ ---         │
+            │ str         │
+            ╞═════════════╡
+            │ Abçdê fgí j │
+            └─────────────┘
+
+            Clean the column values:
+            >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True))
+            shape: (1, 1)
+            ┌─────────────┐
+            │ raw         │
+            │ ---         │
+            │ str         │
+            ╞═════════════╡
+            │ abcde_fgi_j │
+            └─────────────┘
+
+        !!! info "New in version 0.28.0"
+
+        Args:
+            strip_underscores: Removes the outer underscores
+                from all labels in the expression.
+                Default None keeps outer underscores.
+                Values can be either 'left', 'right'
+                or 'both' or the respective shorthand 'l',
+                'r' and True.
+            case_type: Whether to make the labels in the expression lower or uppercase.
+                Current case may be preserved with 'preserve',
+                while snake case conversion (from CamelCase or camelCase only)
+                can be turned on using "snake".
+                Default 'lower' makes all characters lowercase.
+            remove_special: Remove special characters from the values in the expression.
+                Only letters, numbers and underscores are preserved.
+            strip_accents: Whether or not to remove accents from
+                the expression.
+            enforce_string: Whether or not to cast the expression to a string type.
+            truncate_limit: Truncates formatted labels in the expression to
+                the specified length. Default None does not truncate.
+
+        Returns:
+            A polars Expression.
+        """
+        # Delegate to `_clean_expr_names` (imported from `.clean_names`),
+        # passing the wrapped expression and every option by keyword.
+        return _clean_expr_names(
+            obj=self._expr,
+            strip_accents=strip_accents,
+            strip_underscores=strip_underscores,
+            case_type=case_type,
+            remove_special=remove_special,
+            enforce_string=enforce_string,
+            truncate_limit=truncate_limit,
+        )