Skip to content

Commit

Permalink
[ENH]row_to_names for polars (#1363)
Browse files Browse the repository at this point in the history
Added `row_to_names` for polars DataFrames.
  • Loading branch information
samukweku authored Jun 11, 2024
1 parent 2dff4f6 commit 5c281b2
Show file tree
Hide file tree
Showing 12 changed files with 1,236 additions and 748 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## [Unreleased]
- [ENH] Added a `row_to_names` method for polars. Issue #1352
- [ENH] `read_commandline` function now supports polars - Issue #1352

- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341
Expand Down
10 changes: 6 additions & 4 deletions janitor/functions/row_to_names.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Implementation of the `row_to_names` function."""

from __future__ import annotations

import warnings

import numpy as np
Expand All @@ -13,7 +15,7 @@
@deprecated_alias(row_number="row_numbers", remove_row="remove_rows")
def row_to_names(
df: pd.DataFrame,
row_numbers: int = 0,
row_numbers: int | list = 0,
remove_rows: bool = False,
remove_rows_above: bool = False,
reset_index: bool = False,
Expand Down Expand Up @@ -73,7 +75,7 @@ def row_to_names(
Note that indexing starts from 0. It can also be a list,
in which case, a MultiIndex column is created.
Defaults to 0 (first row).
remove_row: Whether the row(s) should be removed from the DataFrame.
remove_rows: Whether the row(s) should be removed from the DataFrame.
remove_rows_above: Whether the row(s) above the selected row should
be removed from the DataFrame.
reset_index: Whether the index should be reset on the returning DataFrame.
Expand All @@ -84,10 +86,10 @@ def row_to_names(
if not pd.options.mode.copy_on_write:
df = df.copy()

check("row_number", row_numbers, [int, list])
check("row_numbers", row_numbers, [int, list])
if isinstance(row_numbers, list):
for entry in row_numbers:
check("entry in the row_number argument", entry, [int])
check("entry in the row_numbers argument", entry, [int])

warnings.warn(
"The function row_to_names will, in the official 1.0 release, "
Expand Down
748 changes: 12 additions & 736 deletions janitor/polars/__init__.py

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions janitor/polars/clean_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ def _strip_underscores_func_expr(

def _clean_column_names(
obj: str,
strip_underscores: str | bool = None,
case_type: str = "lower",
remove_special: bool = False,
strip_accents: bool = False,
truncate_limit: int = None,
strip_underscores: str | bool,
case_type: str,
remove_special: bool,
strip_accents: bool,
truncate_limit: int,
) -> str:
"""
Function to clean the column names of a polars DataFrame.
Expand Down
434 changes: 434 additions & 0 deletions janitor/polars/dataframe.py

Large diffs are not rendered by default.

93 changes: 93 additions & 0 deletions janitor/polars/expressions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from __future__ import annotations

from janitor.utils import import_message

from .clean_names import _clean_expr_names

try:
import polars as pl
except ImportError:
import_message(
submodule="polars",
package="polars",
conda_channel="conda-forge",
pip_install=True,
)


@pl.api.register_expr_namespace("janitor")
class PolarsExpr:
    """Namespace registered as `janitor` on polars expressions.

    Wraps a polars expression so that janitor helpers can be chained
    from it, e.g. `pl.col("raw").janitor.clean_names()`.
    """

    def __init__(self, expr: pl.Expr) -> None:
        # Keep a handle on the wrapped expression; every method
        # in this namespace operates on it.
        self._expr = expr

    def clean_names(
        self,
        strip_underscores: str | bool | None = None,
        case_type: str = "lower",
        remove_special: bool = False,
        strip_accents: bool = False,
        enforce_string: bool = False,
        truncate_limit: int | None = None,
    ) -> pl.Expr:
        """
        Clean the labels in a polars Expression.

        Examples:
            >>> import polars as pl
            >>> import janitor.polars
            >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
            >>> df
            shape: (1, 1)
            ┌─────────────┐
            │ raw         │
            │ ---         │
            │ str         │
            ╞═════════════╡
            │ Abçdê fgí j │
            └─────────────┘

            Clean the column values:
            >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True))
            shape: (1, 1)
            ┌─────────────┐
            │ raw         │
            │ ---         │
            │ str         │
            ╞═════════════╡
            │ abcde_fgi_j │
            └─────────────┘

        !!! info "New in version 0.28.0"

        Args:
            strip_underscores: Removes the outer underscores
                from all labels in the expression.
                Default None keeps outer underscores.
                Values can be either 'left', 'right'
                or 'both' or the respective shorthand 'l',
                'r' and True.
            case_type: Whether to make the labels in the expression
                lower or uppercase.
                Current case may be preserved with 'preserve',
                while snake case conversion
                (from CamelCase or camelCase only)
                can be turned on using "snake".
                Default 'lower' makes all characters lowercase.
            remove_special: Remove special characters
                from the values in the expression.
                Only letters, numbers and underscores are preserved.
            strip_accents: Whether or not to remove accents from
                the expression.
            enforce_string: Whether or not to cast the expression
                to a string type.
            truncate_limit: Truncates formatted labels in the expression to
                the specified length. Default None does not truncate.

        Returns:
            A polars Expression.
        """
        # All of the cleaning logic lives in the shared helper; this method
        # only forwards the wrapped expression together with the options.
        return _clean_expr_names(
            obj=self._expr,
            strip_accents=strip_accents,
            strip_underscores=strip_underscores,
            case_type=case_type,
            remove_special=remove_special,
            enforce_string=enforce_string,
            truncate_limit=truncate_limit,
        )
Loading

0 comments on commit 5c281b2

Please sign in to comment.