From 5c281b252374a001cb140bd71e2cf7942f1f5114 Mon Sep 17 00:00:00 2001 From: Samuel Oranyeli Date: Tue, 11 Jun 2024 10:46:31 +1000 Subject: [PATCH] [ENH] `row_to_names` for polars (#1363) Added `row_to_names` for polars DataFrames. --- CHANGELOG.md | 1 + janitor/functions/row_to_names.py | 10 +- janitor/polars/__init__.py | 748 +----------------- janitor/polars/clean_names.py | 10 +- janitor/polars/dataframe.py | 434 ++++++++++ janitor/polars/expressions.py | 93 +++ janitor/polars/lazyframe.py | 314 ++++++++ janitor/polars/pivot_longer.py | 131 +++ janitor/polars/row_to_names.py | 74 ++ mkdocs.yml | 2 + mkdocs/api/polars.md | 7 +- .../functions/test_row_to_names_polars.py | 160 ++++ 12 files changed, 1236 insertions(+), 748 deletions(-) create mode 100644 janitor/polars/dataframe.py create mode 100644 janitor/polars/expressions.py create mode 100644 janitor/polars/lazyframe.py create mode 100644 janitor/polars/row_to_names.py create mode 100644 tests/polars/functions/test_row_to_names_polars.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f4783253..a7c5e6869 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## [Unreleased] +- [ENH] Added a `row_to_names` method for polars. 
Issue #1352 - [ENH] `read_commandline` function now supports polars - Issue #1352 - [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341 diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index a07208f9a..2f529c8b5 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -1,5 +1,7 @@ """Implementation of the `row_to_names` function.""" +from __future__ import annotations + import warnings import numpy as np @@ -13,7 +15,7 @@ @deprecated_alias(row_number="row_numbers", remove_row="remove_rows") def row_to_names( df: pd.DataFrame, - row_numbers: int = 0, + row_numbers: int | list = 0, remove_rows: bool = False, remove_rows_above: bool = False, reset_index: bool = False, @@ -73,7 +75,7 @@ def row_to_names( Note that indexing starts from 0. It can also be a list, in which case, a MultiIndex column is created. Defaults to 0 (first row). - remove_row: Whether the row(s) should be removed from the DataFrame. + remove_rows: Whether the row(s) should be removed from the DataFrame. remove_rows_above: Whether the row(s) above the selected row should be removed from the DataFrame. reset_index: Whether the index should be reset on the returning DataFrame. 
@@ -84,10 +86,10 @@ def row_to_names( if not pd.options.mode.copy_on_write: df = df.copy() - check("row_number", row_numbers, [int, list]) + check("row_numbers", row_numbers, [int, list]) if isinstance(row_numbers, list): for entry in row_numbers: - check("entry in the row_number argument", entry, [int]) + check("entry in the row_numbers argument", entry, [int]) warnings.warn( "The function row_to_names will, in the official 1.0 release, " diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index f69080207..b16be7a7a 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -1,736 +1,12 @@ -from __future__ import annotations - -from polars.type_aliases import ColumnNameOrSelector - -from janitor.utils import check, import_message - -from .clean_names import _clean_column_names, _clean_expr_names -from .pivot_longer import _pivot_longer, _pivot_longer_dot_value - -try: - import polars as pl -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -@pl.api.register_dataframe_namespace("janitor") -class PolarsFrame: - def __init__(self, df: pl.DataFrame) -> pl.DataFrame: - self._df = df - - def clean_names( - self, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - truncate_limit: int = None, - ) -> pl.DataFrame: - """ - Clean the column names in a polars DataFrame. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame( - ... { - ... "Aloha": range(3), - ... "Bell Chart": range(3), - ... "Animals@#$%^": range(3) - ... } - ... 
) - >>> df - shape: (3, 3) - ┌───────┬────────────┬──────────────┐ - │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪══════════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴──────────────┘ - >>> df.janitor.clean_names(remove_special=True) - shape: (3, 3) - ┌───────┬────────────┬─────────┐ - │ aloha ┆ bell_chart ┆ animals │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪═════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴─────────┘ - - !!! info "New in version 0.28.0" - - Args: - strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the column names lower or uppercase. - Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the column names. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the labels. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. - - Returns: - A polars DataFrame. 
- """ # noqa: E501 - return self._df.rename( - lambda col: _clean_column_names( - obj=col, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - truncate_limit=truncate_limit, - ) - ) - - def pivot_longer( - self, - index: ColumnNameOrSelector = None, - column_names: ColumnNameOrSelector = None, - names_to: list | tuple | str = "variable", - values_to: str = "value", - names_sep: str = None, - names_pattern: str = None, - names_transform: pl.Expr = None, - ) -> pl.DataFrame: - """ - Unpivots a DataFrame from *wide* to *long* format. - - It is modeled after the `pivot_longer` function in R's tidyr package, - and also takes inspiration from the `melt` function in R's data.table package. - - This function is useful to massage a DataFrame into a format where - one or more columns are considered measured variables, and all other - columns are considered as identifier variables. - - All measured variables are *unpivoted* (and typically duplicated) along the - row axis. - - For more granular control on the unpivoting, have a look at - `pivot_longer_spec`. - - Examples: - >>> import polars as pl - >>> import polars.selectors as cs - >>> import janitor.polars - >>> df = pl.DataFrame( - ... { - ... "Sepal.Length": [5.1, 5.9], - ... "Sepal.Width": [3.5, 3.0], - ... "Petal.Length": [1.4, 5.1], - ... "Petal.Width": [0.2, 1.8], - ... "Species": ["setosa", "virginica"], - ... } - ... 
) - >>> df - shape: (2, 5) - ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ - │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ - ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ - │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ - │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ - └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - - Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): - >>> df.janitor.pivot_longer(index = 'Species') - shape: (8, 3) - ┌───────────┬──────────────┬───────┐ - │ Species ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞═══════════╪══════════════╪═══════╡ - │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ - │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ - │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ - │ virginica ┆ Petal.Width ┆ 1.8 │ - └───────────┴──────────────┴───────┘ - - Split the column labels into individual columns: - >>> df.janitor.pivot_longer( - ... index = 'Species', - ... names_to = ('part', 'dimension'), - ... names_sep = '.', - ... ).select('Species','part','dimension','value') - shape: (8, 4) - ┌───────────┬───────┬───────────┬───────┐ - │ Species ┆ part ┆ dimension ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ f64 │ - ╞═══════════╪═══════╪═══════════╪═══════╡ - │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ - │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ - │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ - │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ - │ setosa ┆ Petal ┆ Length ┆ 1.4 │ - │ virginica ┆ Petal ┆ Length ┆ 5.1 │ - │ setosa ┆ Petal ┆ Width ┆ 0.2 │ - │ virginica ┆ Petal ┆ Width ┆ 1.8 │ - └───────────┴───────┴───────────┴───────┘ - - Retain parts of the column names as headers: - >>> df.janitor.pivot_longer( - ... 
index = 'Species', - ... names_to = ('part', '.value'), - ... names_sep = '.', - ... ).select('Species','part','Length','Width') - shape: (4, 4) - ┌───────────┬───────┬────────┬───────┐ - │ Species ┆ part ┆ Length ┆ Width │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 ┆ f64 │ - ╞═══════════╪═══════╪════════╪═══════╡ - │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ - │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ - │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ - │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ - └───────────┴───────┴────────┴───────┘ - - Split the column labels based on regex: - >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]}) - >>> df - shape: (1, 3) - ┌─────┬──────────────┬────────────┐ - │ id ┆ new_sp_m5564 ┆ newrel_f65 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════════╪════════════╡ - │ 1 ┆ 2 ┆ 3 │ - └─────┴──────────────┴────────────┘ - >>> df.janitor.pivot_longer( - ... index = 'id', - ... names_to = ('diagnosis', 'gender', 'age'), - ... names_pattern = r"new_?(.+)_(.)([0-9]+)", - ... ).select('id','diagnosis','gender','age','value').sort(by=pl.all()) - shape: (2, 5) - ┌─────┬───────────┬────────┬──────┬───────┐ - │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str ┆ str ┆ i64 │ - ╞═════╪═══════════╪════════╪══════╪═══════╡ - │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ - │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ - └─────┴───────────┴────────┴──────┴───────┘ - - Convert the dtypes of specific columns with `names_transform`: - >>> df.janitor.pivot_longer( - ... index = "id", - ... names_pattern=r"new_?(.+)_(.)([0-9]+)", - ... names_to=("diagnosis", "gender", "age"), - ... names_transform=pl.col('age').cast(pl.Int32), - ... 
).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all()) - shape: (2, 5) - ┌─────┬───────────┬────────┬──────┬───────┐ - │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str ┆ i32 ┆ i64 │ - ╞═════╪═══════════╪════════╪══════╪═══════╡ - │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ - │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ - └─────┴───────────┴────────┴──────┴───────┘ - - Use multiple `.value` to reshape the dataframe: - >>> df = pl.DataFrame( - ... [ - ... { - ... "x_1_mean": 10, - ... "x_2_mean": 20, - ... "y_1_mean": 30, - ... "y_2_mean": 40, - ... "unit": 50, - ... } - ... ] - ... ) - >>> df - shape: (1, 5) - ┌──────────┬──────────┬──────────┬──────────┬──────┐ - │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞══════════╪══════════╪══════════╪══════════╪══════╡ - │ 10 ┆ 20 ┆ 30 ┆ 40 ┆ 50 │ - └──────────┴──────────┴──────────┴──────────┴──────┘ - >>> df.janitor.pivot_longer( - ... index="unit", - ... names_to=(".value", "time", ".value"), - ... names_pattern=r"(x|y)_([0-9])(_mean)", - ... ).select('unit','time','x_mean','y_mean').sort(by=pl.all()) - shape: (2, 4) - ┌──────┬──────┬────────┬────────┐ - │ unit ┆ time ┆ x_mean ┆ y_mean │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ i64 │ - ╞══════╪══════╪════════╪════════╡ - │ 50 ┆ 1 ┆ 10 ┆ 30 │ - │ 50 ┆ 2 ┆ 20 ┆ 40 │ - └──────┴──────┴────────┴────────┘ - - !!! info "New in version 0.28.0" - - Args: - index: Column(s) or selector(s) to use as identifier variables. - column_names: Column(s) or selector(s) to unpivot. - names_to: Name of new column as a string that will contain - what were previously the column names in `column_names`. - The default is `variable` if no value is provided. It can - also be a list/tuple of strings that will serve as new column - names, if `name_sep` or `names_pattern` is provided. 
- If `.value` is in `names_to`, new column names will be extracted - from part of the existing column names and overrides `values_to`. - values_to: Name of new column as a string that will contain what - were previously the values of the columns in `column_names`. - names_sep: Determines how the column name is broken up, if - `names_to` contains multiple values. It takes the same - specification as polars' `str.split` method. - names_pattern: Determines how the column name is broken up. - It can be a regular expression containing matching groups. - It takes the same - specification as polars' `str.extract_groups` method. - names_transform: Use this option to change the types of columns that - have been transformed to rows. - This does not applies to the values' columns. - Accepts a polars expression or a list of polars expressions. - Applicable only if one of names_sep - or names_pattern is provided. - - Returns: - A polars DataFrame that has been unpivoted from wide to long - format. - """ # noqa: E501 - return _pivot_longer( - df=self._df, - index=index, - column_names=column_names, - names_pattern=names_pattern, - names_sep=names_sep, - names_to=names_to, - values_to=values_to, - names_transform=names_transform, - ) - - -@pl.api.register_lazyframe_namespace("janitor") -class PolarsLazyFrame: - def __init__(self, df: pl.LazyFrame) -> pl.LazyFrame: - self._df = df - - def clean_names( - self, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - truncate_limit: int = None, - ) -> pl.LazyFrame: - """ - Clean the column names in a polars LazyFrame. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.LazyFrame( - ... { - ... "Aloha": range(3), - ... "Bell Chart": range(3), - ... "Animals@#$%^": range(3) - ... } - ... 
) - >>> df.collect() - shape: (3, 3) - ┌───────┬────────────┬──────────────┐ - │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪══════════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴──────────────┘ - >>> df.janitor.clean_names(remove_special=True).collect() - shape: (3, 3) - ┌───────┬────────────┬─────────┐ - │ aloha ┆ bell_chart ┆ animals │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪═════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴─────────┘ - - !!! info "New in version 0.28.0" - - Args: - strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the column names lower or uppercase. - Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the column names. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the labels. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. - - Returns: - A polars LazyFrame. 
- """ # noqa: E501 - return self._df.rename( - lambda col: _clean_column_names( - obj=col, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - truncate_limit=truncate_limit, - ) - ) - - def pivot_longer( - self, - index: ColumnNameOrSelector = None, - column_names: ColumnNameOrSelector = None, - names_to: list | tuple | str = "variable", - values_to: str = "value", - names_sep: str = None, - names_pattern: str = None, - names_transform: pl.Expr = None, - ) -> pl.LazyFrame: - """ - Unpivots a LazyFrame from *wide* to *long* format. - - It is modeled after the `pivot_longer` function in R's tidyr package, - and also takes inspiration from the `melt` function in R's data.table package. - - This function is useful to massage a LazyFrame into a format where - one or more columns are considered measured variables, and all other - columns are considered as identifier variables. - - All measured variables are *unpivoted* (and typically duplicated) along the - row axis. - - For more granular control on the unpivoting, have a look at - `pivot_longer_spec`. - - Examples: - >>> import polars as pl - >>> import polars.selectors as cs - >>> import janitor.polars - >>> df = pl.LazyFrame( - ... { - ... "Sepal.Length": [5.1, 5.9], - ... "Sepal.Width": [3.5, 3.0], - ... "Petal.Length": [1.4, 5.1], - ... "Petal.Width": [0.2, 1.8], - ... "Species": ["setosa", "virginica"], - ... } - ... 
) - >>> df.collect() - shape: (2, 5) - ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ - │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ - ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ - │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ - │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ - └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - - >>> df.janitor.pivot_longer(index = 'Species').collect() - shape: (8, 3) - ┌───────────┬──────────────┬───────┐ - │ Species ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞═══════════╪══════════════╪═══════╡ - │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ - │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ - │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ - │ virginica ┆ Petal.Width ┆ 1.8 │ - └───────────┴──────────────┴───────┘ - - !!! info "New in version 0.28.0" - - Args: - index: Column(s) or selector(s) to use as identifier variables. - column_names: Column(s) or selector(s) to unpivot. - names_to: Name of new column as a string that will contain - what were previously the column names in `column_names`. - The default is `variable` if no value is provided. It can - also be a list/tuple of strings that will serve as new column - names, if `name_sep` or `names_pattern` is provided. - If `.value` is in `names_to`, new column names will be extracted - from part of the existing column names and overrides `values_to`. - values_to: Name of new column as a string that will contain what - were previously the values of the columns in `column_names`. - names_sep: Determines how the column name is broken up, if - `names_to` contains multiple values. It takes the same - specification as polars' `str.split` method. - names_pattern: Determines how the column name is broken up. 
- It can be a regular expression containing matching groups. - It takes the same - specification as polars' `str.extract_groups` method. - names_transform: Use this option to change the types of columns that - have been transformed to rows. - This does not applies to the values' columns. - Accepts a polars expression or a list of polars expressions. - Applicable only if one of names_sep - or names_pattern is provided. - - Returns: - A polars LazyFrame that has been unpivoted from wide to long - format. - """ # noqa: E501 - return _pivot_longer( - df=self._df, - index=index, - column_names=column_names, - names_pattern=names_pattern, - names_sep=names_sep, - names_to=names_to, - values_to=values_to, - names_transform=names_transform, - ) - - -@pl.api.register_expr_namespace("janitor") -class PolarsExpr: - def __init__(self, expr: pl.Expr) -> pl.Expr: - self._expr = expr - - def clean_names( - self, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - enforce_string: bool = False, - truncate_limit: int = None, - ) -> pl.Expr: - """ - Clean the labels in a polars Expression. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) - >>> df - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ Abçdê fgí j │ - └─────────────┘ - - Clean the column values: - >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True)) - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ abcde_fgi_j │ - └─────────────┘ - - !!! info "New in version 0.28.0" - - Args: - strip_underscores: Removes the outer underscores - from all labels in the expression. - Default None keeps outer underscores. - Values can be either 'left', 'right' - or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the labels in the expression lower or uppercase. 
- Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the values in the expression. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the expression. - enforce_string: Whether or not to cast the expression to a string type. - truncate_limit: Truncates formatted labels in the expression to - the specified length. Default None does not truncate. - - Returns: - A polars Expression. - """ - return _clean_expr_names( - obj=self._expr, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - enforce_string=enforce_string, - truncate_limit=truncate_limit, - ) - - -def pivot_longer_spec( - df: pl.DataFrame | pl.LazyFrame, - spec: pl.DataFrame, -) -> pl.DataFrame | pl.LazyFrame: - """ - A declarative interface to pivot a DataFrame - from wide to long form, - where you describe how the data will be unpivoted, - using a DataFrame. This gives you, the user, - more control over the transformation to long form, - using a *spec* DataFrame that describes exactly - how data stored in the column names - becomes variables. - - It can come in handy for situations where - `janitor.polars.pivot_longer` - seems inadequate for the transformation. - - !!! info "New in version 0.28.0" - - Examples: - >>> import pandas as pd - >>> import janitor.polars - >>> df = pl.DataFrame( - ... { - ... "Sepal.Length": [5.1, 5.9], - ... "Sepal.Width": [3.5, 3.0], - ... "Petal.Length": [1.4, 5.1], - ... "Petal.Width": [0.2, 1.8], - ... "Species": ["setosa", "virginica"], - ... } - ... 
) - >>> df - shape: (2, 5) - ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ - │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ - ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ - │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ - │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ - └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - >>> spec = {'.name':['Sepal.Length','Petal.Length', - ... 'Sepal.Width','Petal.Width'], - ... '.value':['Length','Length','Width','Width'], - ... 'part':['Sepal','Petal','Sepal','Petal']} - >>> spec = pl.DataFrame(spec) - >>> spec - shape: (4, 3) - ┌──────────────┬────────┬───────┐ - │ .name ┆ .value ┆ part │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str │ - ╞══════════════╪════════╪═══════╡ - │ Sepal.Length ┆ Length ┆ Sepal │ - │ Petal.Length ┆ Length ┆ Petal │ - │ Sepal.Width ┆ Width ┆ Sepal │ - │ Petal.Width ┆ Width ┆ Petal │ - └──────────────┴────────┴───────┘ - >>> df.pipe(pivot_longer_spec,spec=spec) - shape: (4, 4) - ┌───────────┬────────┬───────┬───────┐ - │ Species ┆ Length ┆ Width ┆ part │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ str │ - ╞═══════════╪════════╪═══════╪═══════╡ - │ setosa ┆ 5.1 ┆ 3.5 ┆ Sepal │ - │ virginica ┆ 5.9 ┆ 3.0 ┆ Sepal │ - │ setosa ┆ 1.4 ┆ 0.2 ┆ Petal │ - │ virginica ┆ 5.1 ┆ 1.8 ┆ Petal │ - └───────────┴────────┴───────┴───────┘ - - Args: - df: The source DataFrame to unpivot. - spec: A specification DataFrame. - At a minimum, the spec DataFrame - must have a `.name` column - and a `.value` column. - The `.name` column should contain the - columns in the source DataFrame that will be - transformed to long form. - The `.value` column gives the name of the column - that the values in the source DataFrame will go into. 
- Additional columns in the spec DataFrame - should be named to match columns - in the long format of the dataset and contain values - corresponding to columns pivoted from the wide format. - Note that these additional columns should not already exist - in the source DataFrame. - - Raises: - KeyError: If `.name` or `.value` is missing from the spec's columns. - ValueError: If the labels in `spec['.name']` is not unique. - - Returns: - A polars DataFrame/LazyFrame. - """ - check("spec", spec, [pl.DataFrame]) - if ".name" not in spec.columns: - raise KeyError( - "Kindly ensure the spec DataFrame has a `.name` column." - ) - if ".value" not in spec.columns: - raise KeyError( - "Kindly ensure the spec DataFrame has a `.value` column." - ) - if spec.select(pl.col(".name").is_duplicated().any()).item(): - raise ValueError("The labels in the `.name` column should be unique.") - - exclude = set(df.columns).intersection(spec.columns) - if exclude: - raise ValueError( - f"Labels {*exclude, } in the spec dataframe already exist " - "as column labels in the source dataframe. " - "Kindly ensure the spec DataFrame's columns " - "are not present in the source DataFrame." - ) - - if spec.columns[:2] != [".name", ".value"]: - raise ValueError( - "The first two columns of the spec DataFrame " - "should be '.name' and '.value', " - "with '.name' coming before '.value'." 
- ) - - return _pivot_longer_dot_value( - df=df, - spec=spec, - ) - - -__all__ = ["PolarsFrame", "PolarsLazyFrame", "pivot_longer_spec"] +from .dataframe import PolarsDataFrame +from .expressions import PolarsExpr +from .lazyframe import PolarsLazyFrame +from .pivot_longer import pivot_longer_spec + +__all__ = [ + "pivot_longer_spec", + "clean_names", + "PolarsDataFrame", + "PolarsLazyFrame", + "PolarsExpr", +] diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py index 90e2656e2..5cb28e5f9 100644 --- a/janitor/polars/clean_names.py +++ b/janitor/polars/clean_names.py @@ -115,11 +115,11 @@ def _strip_underscores_func_expr( def _clean_column_names( obj: str, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - truncate_limit: int = None, + strip_underscores: str | bool, + case_type: str, + remove_special: bool, + strip_accents: bool, + truncate_limit: int, ) -> str: """ Function to clean the column names of a polars DataFrame. 
diff --git a/janitor/polars/dataframe.py b/janitor/polars/dataframe.py new file mode 100644 index 000000000..4a4016c9c --- /dev/null +++ b/janitor/polars/dataframe.py @@ -0,0 +1,434 @@ +from __future__ import annotations + +from polars.type_aliases import ColumnNameOrSelector + +from janitor.utils import import_message + +from .clean_names import _clean_column_names +from .pivot_longer import _pivot_longer +from .row_to_names import _row_to_names + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +@pl.api.register_dataframe_namespace("janitor") +class PolarsDataFrame: + def __init__(self, df: pl.DataFrame) -> pl.DataFrame: + self._df = df + + def clean_names( + self, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + truncate_limit: int = None, + ) -> pl.DataFrame: + """ + Clean the column names in a polars DataFrame. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... "Aloha": range(3), + ... "Bell Chart": range(3), + ... "Animals@#$%^": range(3) + ... } + ... ) + >>> df + shape: (3, 3) + ┌───────┬────────────┬──────────────┐ + │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪══════════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴──────────────┘ + >>> df.janitor.clean_names(remove_special=True) + shape: (3, 3) + ┌───────┬────────────┬─────────┐ + │ aloha ┆ bell_chart ┆ animals │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪═════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴─────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores from all + column names. Default None keeps outer underscores. 
Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the column names lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the column names. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the labels. + truncate_limit: Truncates formatted column names to + the specified length. Default None does not truncate. + + Returns: + A polars DataFrame. + """ # noqa: E501 + return self._df.rename( + lambda col: _clean_column_names( + obj=col, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + truncate_limit=truncate_limit, + ) + ) + + def pivot_longer( + self, + index: ColumnNameOrSelector = None, + column_names: ColumnNameOrSelector = None, + names_to: list | tuple | str = "variable", + values_to: str = "value", + names_sep: str = None, + names_pattern: str = None, + names_transform: pl.Expr = None, + ) -> pl.DataFrame: + """ + Unpivots a DataFrame from *wide* to *long* format. + + It is modeled after the `pivot_longer` function in R's tidyr package, + and also takes inspiration from the `melt` function in R's data.table package. + + This function is useful to massage a DataFrame into a format where + one or more columns are considered measured variables, and all other + columns are considered as identifier variables. + + All measured variables are *unpivoted* (and typically duplicated) along the + row axis. + + For more granular control on the unpivoting, have a look at + `pivot_longer_spec`. + + Examples: + >>> import polars as pl + >>> import polars.selectors as cs + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... 
"Sepal.Length": [5.1, 5.9], + ... "Sepal.Width": [3.5, 3.0], + ... "Petal.Length": [1.4, 5.1], + ... "Petal.Width": [0.2, 1.8], + ... "Species": ["setosa", "virginica"], + ... } + ... ) + >>> df + shape: (2, 5) + ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ + │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ + ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ + │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ + │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ + └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ + + Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): + >>> df.janitor.pivot_longer(index = 'Species') + shape: (8, 3) + ┌───────────┬──────────────┬───────┐ + │ Species ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Sepal.Length ┆ 5.1 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ setosa ┆ Sepal.Width ┆ 3.5 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ virginica ┆ Petal.Length ┆ 5.1 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ + │ virginica ┆ Petal.Width ┆ 1.8 │ + └───────────┴──────────────┴───────┘ + + Split the column labels into individual columns: + >>> df.janitor.pivot_longer( + ... index = 'Species', + ... names_to = ('part', 'dimension'), + ... names_sep = '.', + ... 
).select('Species','part','dimension','value') + shape: (8, 4) + ┌───────────┬───────┬───────────┬───────┐ + │ Species ┆ part ┆ dimension ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ f64 │ + ╞═══════════╪═══════╪═══════════╪═══════╡ + │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ + │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ + │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ + │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ + │ setosa ┆ Petal ┆ Length ┆ 1.4 │ + │ virginica ┆ Petal ┆ Length ┆ 5.1 │ + │ setosa ┆ Petal ┆ Width ┆ 0.2 │ + │ virginica ┆ Petal ┆ Width ┆ 1.8 │ + └───────────┴───────┴───────────┴───────┘ + + Retain parts of the column names as headers: + >>> df.janitor.pivot_longer( + ... index = 'Species', + ... names_to = ('part', '.value'), + ... names_sep = '.', + ... ).select('Species','part','Length','Width') + shape: (4, 4) + ┌───────────┬───────┬────────┬───────┐ + │ Species ┆ part ┆ Length ┆ Width │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 ┆ f64 │ + ╞═══════════╪═══════╪════════╪═══════╡ + │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ + │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ + │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ + └───────────┴───────┴────────┴───────┘ + + Split the column labels based on regex: + >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]}) + >>> df + shape: (1, 3) + ┌─────┬──────────────┬────────────┐ + │ id ┆ new_sp_m5564 ┆ newrel_f65 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════════╪════════════╡ + │ 1 ┆ 2 ┆ 3 │ + └─────┴──────────────┴────────────┘ + >>> df.janitor.pivot_longer( + ... index = 'id', + ... names_to = ('diagnosis', 'gender', 'age'), + ... names_pattern = r"new_?(.+)_(.)([0-9]+)", + ... 
).select('id','diagnosis','gender','age','value').sort(by=pl.all()) + shape: (2, 5) + ┌─────┬───────────┬────────┬──────┬───────┐ + │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str ┆ str ┆ i64 │ + ╞═════╪═══════════╪════════╪══════╪═══════╡ + │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ + │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ + └─────┴───────────┴────────┴──────┴───────┘ + + Convert the dtypes of specific columns with `names_transform`: + >>> df.janitor.pivot_longer( + ... index = "id", + ... names_pattern=r"new_?(.+)_(.)([0-9]+)", + ... names_to=("diagnosis", "gender", "age"), + ... names_transform=pl.col('age').cast(pl.Int32), + ... ).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all()) + shape: (2, 5) + ┌─────┬───────────┬────────┬──────┬───────┐ + │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str ┆ i32 ┆ i64 │ + ╞═════╪═══════════╪════════╪══════╪═══════╡ + │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ + │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ + └─────┴───────────┴────────┴──────┴───────┘ + + Use multiple `.value` to reshape the dataframe: + >>> df = pl.DataFrame( + ... [ + ... { + ... "x_1_mean": 10, + ... "x_2_mean": 20, + ... "y_1_mean": 30, + ... "y_2_mean": 40, + ... "unit": 50, + ... } + ... ] + ... ) + >>> df + shape: (1, 5) + ┌──────────┬──────────┬──────────┬──────────┬──────┐ + │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞══════════╪══════════╪══════════╪══════════╪══════╡ + │ 10 ┆ 20 ┆ 30 ┆ 40 ┆ 50 │ + └──────────┴──────────┴──────────┴──────────┴──────┘ + >>> df.janitor.pivot_longer( + ... index="unit", + ... names_to=(".value", "time", ".value"), + ... names_pattern=r"(x|y)_([0-9])(_mean)", + ... 
).select('unit','time','x_mean','y_mean').sort(by=pl.all()) + shape: (2, 4) + ┌──────┬──────┬────────┬────────┐ + │ unit ┆ time ┆ x_mean ┆ y_mean │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ i64 │ + ╞══════╪══════╪════════╪════════╡ + │ 50 ┆ 1 ┆ 10 ┆ 30 │ + │ 50 ┆ 2 ┆ 20 ┆ 40 │ + └──────┴──────┴────────┴────────┘ + + !!! info "New in version 0.28.0" + + Args: + index: Column(s) or selector(s) to use as identifier variables. + column_names: Column(s) or selector(s) to unpivot. + names_to: Name of new column as a string that will contain + what were previously the column names in `column_names`. + The default is `variable` if no value is provided. It can + also be a list/tuple of strings that will serve as new column + names, if `name_sep` or `names_pattern` is provided. + If `.value` is in `names_to`, new column names will be extracted + from part of the existing column names and overrides `values_to`. + values_to: Name of new column as a string that will contain what + were previously the values of the columns in `column_names`. + names_sep: Determines how the column name is broken up, if + `names_to` contains multiple values. It takes the same + specification as polars' `str.split` method. + names_pattern: Determines how the column name is broken up. + It can be a regular expression containing matching groups. + It takes the same + specification as polars' `str.extract_groups` method. + names_transform: Use this option to change the types of columns that + have been transformed to rows. + This does not applies to the values' columns. + Accepts a polars expression or a list of polars expressions. + Applicable only if one of names_sep + or names_pattern is provided. + + Returns: + A polars DataFrame that has been unpivoted from wide to long + format. 
+ """ # noqa: E501 + return _pivot_longer( + df=self._df, + index=index, + column_names=column_names, + names_pattern=names_pattern, + names_sep=names_sep, + names_to=names_to, + values_to=values_to, + names_transform=names_transform, + ) + + def row_to_names( + self, + row_numbers: int | list = 0, + remove_rows: bool = False, + remove_rows_above: bool = False, + separator: str = "_", + ) -> pl.DataFrame: + """ + Elevates a row, or rows, to be the column names of a DataFrame. + + Examples: + Replace column names with the first row. + + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({ + ... "a": ["nums", '6', '9'], + ... "b": ["chars", "x", "y"], + ... }) + >>> df + shape: (3, 2) + ┌──────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ nums ┆ chars │ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + >>> df.janitor.row_to_names(0, remove_rows=True) + shape: (2, 2) + ┌──────┬───────┐ + │ nums ┆ chars │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + >>> df.janitor.row_to_names(row_numbers=[0,1], remove_rows=True) + shape: (1, 2) + ┌────────┬─────────┐ + │ nums_6 ┆ chars_x │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════╪═════════╡ + │ 9 ┆ y │ + └────────┴─────────┘ + + Remove rows above the elevated row and the elevated row itself. + + >>> df = pl.DataFrame({ + ... "a": ["bla1", "nums", '6', '9'], + ... "b": ["bla2", "chars", "x", "y"], + ... }) + >>> df + shape: (4, 2) + ┌──────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ bla1 ┆ bla2 │ + │ nums ┆ chars │ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + >>> df.janitor.row_to_names(1, remove_rows=True, remove_rows_above=True) + shape: (2, 2) + ┌──────┬───────┐ + │ nums ┆ chars │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + + !!! info "New in version 0.28.0" + + Args: + row_numbers: Position of the row(s) containing the variable names. 
@pl.api.register_expr_namespace("janitor")
class PolarsExpr:
    """
    The `janitor` namespace for polars expressions.

    The class is registered under the name `janitor`, so its methods are
    reached as `pl.col(...).janitor.<method>(...)`.
    """

    def __init__(self, expr: pl.Expr) -> None:
        # Keep a handle on the expression the namespace was accessed from;
        # every method below operates on it.
        self._expr = expr

    def clean_names(
        self,
        strip_underscores: str | bool | None = None,
        case_type: str = "lower",
        remove_special: bool = False,
        strip_accents: bool = False,
        enforce_string: bool = False,
        truncate_limit: int | None = None,
    ) -> pl.Expr:
        """
        Clean the labels in a polars Expression.

        Examples:
            >>> import polars as pl
            >>> import janitor.polars
            >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
            >>> df
            shape: (1, 1)
            ┌─────────────┐
            │ raw         │
            │ ---         │
            │ str         │
            ╞═════════════╡
            │ Abçdê fgí j │
            └─────────────┘

            Clean the column values:
            >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True))
            shape: (1, 1)
            ┌─────────────┐
            │ raw         │
            │ ---         │
            │ str         │
            ╞═════════════╡
            │ abcde_fgi_j │
            └─────────────┘

        !!! info "New in version 0.28.0"

        Args:
            strip_underscores: Removes the outer underscores
                from all labels in the expression.
                Default None keeps outer underscores.
                Values can be either 'left', 'right'
                or 'both' or the respective shorthand 'l',
                'r' and True.
            case_type: Whether to make the labels in the expression lower or uppercase.
                Current case may be preserved with 'preserve',
                while snake case conversion (from CamelCase or camelCase only)
                can be turned on using "snake".
                Default 'lower' makes all characters lowercase.
            remove_special: Remove special characters from the values in the expression.
                Only letters, numbers and underscores are preserved.
            strip_accents: Whether or not to remove accents from
                the expression.
            enforce_string: Whether or not to cast the expression to a string type.
            truncate_limit: Truncates formatted labels in the expression to
                the specified length. Default None does not truncate.

        Returns:
            A polars Expression.
        """  # noqa: E501
        return _clean_expr_names(
            obj=self._expr,
            strip_accents=strip_accents,
            strip_underscores=strip_underscores,
            case_type=case_type,
            remove_special=remove_special,
            enforce_string=enforce_string,
            truncate_limit=truncate_limit,
        )
def clean_names(
    self,
    strip_underscores: str | bool = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = False,
    truncate_limit: int = None,
) -> pl.LazyFrame:
    """
    Return the LazyFrame with every column name cleaned.

    Each column name is passed through `_clean_column_names` and the
    frame is renamed with the results; the data itself is not touched.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.LazyFrame(
        ...     {
        ...         "Aloha": range(3),
        ...         "Bell Chart": range(3),
        ...         "Animals@#$%^": range(3)
        ...     }
        ... )
        >>> df.collect()
        shape: (3, 3)
        ┌───────┬────────────┬──────────────┐
        │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
        │ ---   ┆ ---        ┆ ---          │
        │ i64   ┆ i64        ┆ i64          │
        ╞═══════╪════════════╪══════════════╡
        │ 0     ┆ 0          ┆ 0            │
        │ 1     ┆ 1          ┆ 1            │
        │ 2     ┆ 2          ┆ 2            │
        └───────┴────────────┴──────────────┘
        >>> df.janitor.clean_names(remove_special=True).collect()
        shape: (3, 3)
        ┌───────┬────────────┬─────────┐
        │ aloha ┆ bell_chart ┆ animals │
        │ ---   ┆ ---        ┆ ---     │
        │ i64   ┆ i64        ┆ i64     │
        ╞═══════╪════════════╪═════════╡
        │ 0     ┆ 0          ┆ 0       │
        │ 1     ┆ 1          ┆ 1       │
        │ 2     ┆ 2          ┆ 2       │
        └───────┴────────────┴─────────┘

    !!! info "New in version 0.28.0"

    Args:
        strip_underscores: Which outer underscores to strip from the
            column names. The default, None, keeps them. Accepted values
            are 'left', 'right' or 'both', or the shorthands 'l', 'r'
            and True.
        case_type: Casing applied to the column names. 'lower' (the
            default) lowercases everything, 'preserve' keeps the current
            case, and "snake" converts CamelCase/camelCase to snake case.
        remove_special: If True, drop special characters so that only
            letters, numbers and underscores remain.
        strip_accents: If True, remove accents from the labels.
        truncate_limit: Maximum length of the formatted column names.
            The default, None, does not truncate.

    Returns:
        A polars LazyFrame.
    """  # noqa: E501

    def _tidy(label):
        # Applied by polars to each existing column name in turn.
        return _clean_column_names(
            obj=label,
            strip_accents=strip_accents,
            strip_underscores=strip_underscores,
            case_type=case_type,
            remove_special=remove_special,
            truncate_limit=truncate_limit,
        )

    renamed = self._df.rename(_tidy)
    return renamed
) + >>> df.collect() + shape: (2, 5) + ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ + │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ + ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ + │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ + │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ + └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ + + >>> df.janitor.pivot_longer(index = 'Species').collect() + shape: (8, 3) + ┌───────────┬──────────────┬───────┐ + │ Species ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Sepal.Length ┆ 5.1 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ setosa ┆ Sepal.Width ┆ 3.5 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ virginica ┆ Petal.Length ┆ 5.1 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ + │ virginica ┆ Petal.Width ┆ 1.8 │ + └───────────┴──────────────┴───────┘ + + !!! info "New in version 0.28.0" + + Args: + index: Column(s) or selector(s) to use as identifier variables. + column_names: Column(s) or selector(s) to unpivot. + names_to: Name of new column as a string that will contain + what were previously the column names in `column_names`. + The default is `variable` if no value is provided. It can + also be a list/tuple of strings that will serve as new column + names, if `name_sep` or `names_pattern` is provided. + If `.value` is in `names_to`, new column names will be extracted + from part of the existing column names and overrides `values_to`. + values_to: Name of new column as a string that will contain what + were previously the values of the columns in `column_names`. + names_sep: Determines how the column name is broken up, if + `names_to` contains multiple values. It takes the same + specification as polars' `str.split` method. + names_pattern: Determines how the column name is broken up. 
def row_to_names(
    self,
    row_numbers: int | list = 0,
    remove_rows: bool = False,
    remove_rows_above: bool = False,
    separator: str = "_",
) -> pl.LazyFrame:
    """
    Elevates a row, or rows, to be the column names of a LazyFrame.

    The new names are data, not schema, so the selected row(s) have to
    be read before the frame can be renamed: that small selection is
    collected when this method is called. The rest of the query stays
    lazy.

    Examples:
        Replace column names with the first row.

        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.LazyFrame({
        ...     "a": ["nums", '6', '9'],
        ...     "b": ["chars", "x", "y"],
        ... })
        >>> df.collect()
        shape: (3, 2)
        ┌──────┬───────┐
        │ a    ┆ b     │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ nums ┆ chars │
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘
        >>> df.janitor.row_to_names(0, remove_rows=True).collect()
        shape: (2, 2)
        ┌──────┬───────┐
        │ nums ┆ chars │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘
        >>> df.janitor.row_to_names(row_numbers=[0,1], remove_rows=True).collect()
        shape: (1, 2)
        ┌────────┬─────────┐
        │ nums_6 ┆ chars_x │
        │ ---    ┆ ---     │
        │ str    ┆ str     │
        ╞════════╪═════════╡
        │ 9      ┆ y       │
        └────────┴─────────┘

        Remove rows above the elevated row and the elevated row itself.

        >>> df = pl.LazyFrame({
        ...     "a": ["bla1", "nums", '6', '9'],
        ...     "b": ["bla2", "chars", "x", "y"],
        ... })
        >>> df.collect()
        shape: (4, 2)
        ┌──────┬───────┐
        │ a    ┆ b     │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ bla1 ┆ bla2  │
        │ nums ┆ chars │
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘
        >>> df.janitor.row_to_names(1, remove_rows=True, remove_rows_above=True).collect()
        shape: (2, 2)
        ┌──────┬───────┐
        │ nums ┆ chars │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘

    !!! info "New in version 0.28.0"

    Args:
        row_numbers: Position of the row(s) containing the variable names.
            Note that indexing starts from 0. It can also be a list.
            Defaults to 0 (first row).
        remove_rows: Whether the row(s) should be removed from the LazyFrame.
        remove_rows_above: Whether the row(s) above the selected row should
            be removed from the LazyFrame.
        separator: If `row_numbers` is a list of numbers, this parameter
            determines how the labels will be combined into a single string.
            Default is '_'.

    Raises:
        TypeError: If `separator` is not a string, if `row_numbers` is
            neither an integer nor a list, or if an entry of the list
            is not an integer.
        ValueError: If `remove_rows_above` is True and `row_numbers` is
            a list whose integers are not consecutive and increasing.

    Returns:
        A polars LazyFrame.
    """  # noqa: E501
    return _row_to_names(
        self._df,
        row_numbers=row_numbers,
        remove_rows=remove_rows,
        remove_rows_above=remove_rows_above,
        separator=separator,
    )
info "New in version 0.28.0" + + Examples: + >>> import pandas as pd + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... "Sepal.Length": [5.1, 5.9], + ... "Sepal.Width": [3.5, 3.0], + ... "Petal.Length": [1.4, 5.1], + ... "Petal.Width": [0.2, 1.8], + ... "Species": ["setosa", "virginica"], + ... } + ... ) + >>> df + shape: (2, 5) + ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ + │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ + ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ + │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ + │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ + └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ + >>> spec = {'.name':['Sepal.Length','Petal.Length', + ... 'Sepal.Width','Petal.Width'], + ... '.value':['Length','Length','Width','Width'], + ... 'part':['Sepal','Petal','Sepal','Petal']} + >>> spec = pl.DataFrame(spec) + >>> spec + shape: (4, 3) + ┌──────────────┬────────┬───────┐ + │ .name ┆ .value ┆ part │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str │ + ╞══════════════╪════════╪═══════╡ + │ Sepal.Length ┆ Length ┆ Sepal │ + │ Petal.Length ┆ Length ┆ Petal │ + │ Sepal.Width ┆ Width ┆ Sepal │ + │ Petal.Width ┆ Width ┆ Petal │ + └──────────────┴────────┴───────┘ + >>> df.pipe(pivot_longer_spec,spec=spec) + shape: (4, 4) + ┌───────────┬────────┬───────┬───────┐ + │ Species ┆ Length ┆ Width ┆ part │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str │ + ╞═══════════╪════════╪═══════╪═══════╡ + │ setosa ┆ 5.1 ┆ 3.5 ┆ Sepal │ + │ virginica ┆ 5.9 ┆ 3.0 ┆ Sepal │ + │ setosa ┆ 1.4 ┆ 0.2 ┆ Petal │ + │ virginica ┆ 5.1 ┆ 1.8 ┆ Petal │ + └───────────┴────────┴───────┴───────┘ + + Args: + df: The source DataFrame to unpivot. + spec: A specification DataFrame. + At a minimum, the spec DataFrame + must have a `.name` column + and a `.value` column. 
+ The `.name` column should contain the + columns in the source DataFrame that will be + transformed to long form. + The `.value` column gives the name of the column + that the values in the source DataFrame will go into. + Additional columns in the spec DataFrame + should be named to match columns + in the long format of the dataset and contain values + corresponding to columns pivoted from the wide format. + Note that these additional columns should not already exist + in the source DataFrame. + + Raises: + KeyError: If `.name` or `.value` is missing from the spec's columns. + ValueError: If the labels in `spec['.name']` is not unique. + + Returns: + A polars DataFrame/LazyFrame. + """ + check("spec", spec, [pl.DataFrame]) + if ".name" not in spec.columns: + raise KeyError( + "Kindly ensure the spec DataFrame has a `.name` column." + ) + if ".value" not in spec.columns: + raise KeyError( + "Kindly ensure the spec DataFrame has a `.value` column." + ) + if spec.select(pl.col(".name").is_duplicated().any()).item(): + raise ValueError("The labels in the `.name` column should be unique.") + + exclude = set(df.columns).intersection(spec.columns) + if exclude: + raise ValueError( + f"Labels {*exclude, } in the spec dataframe already exist " + "as column labels in the source dataframe. " + "Kindly ensure the spec DataFrame's columns " + "are not present in the source DataFrame." + ) + + if spec.columns[:2] != [".name", ".value"]: + raise ValueError( + "The first two columns of the spec DataFrame " + "should be '.name' and '.value', " + "with '.name' coming before '.value'." 
def _row_to_names(
    df: pl.DataFrame | pl.LazyFrame,
    row_numbers: int | list,
    remove_rows: bool,
    remove_rows_above: bool,
    separator: str,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Promote one or more rows of `df` to column names.

    Args:
        df: The frame whose columns are renamed. For a LazyFrame, only
            the selection of header rows is collected (the names must be
            known to build the rename mapping); the returned frame is
            still lazy.
        row_numbers: Zero-based position of the header row, or a list of
            positions. With a list, the labels found in each column are
            cast to strings and joined with `separator`.
        remove_rows: Drop the header row(s) from the result.
        remove_rows_above: Drop every row above the header row(s). With
            a list, the positions must be consecutive and increasing.
        separator: String placed between the labels when `row_numbers`
            is a list.

    Raises:
        TypeError: If `separator` is not a string, `row_numbers` is
            neither an int nor a list, or a list entry is not an int.
        ValueError: If `row_numbers` is an empty list, or if
            `remove_rows_above` is set and the list is not made of
            consecutive increasing integers.

    Returns:
        A frame of the same kind as `df`, with the new column names.
    """
    check("separator", separator, [str])
    check("row_numbers", row_numbers, [int, list])
    row_numbers_is_a_list = isinstance(row_numbers, list)
    if row_numbers_is_a_list:
        if not row_numbers:
            # Without this guard an empty list surfaces later as a bare
            # IndexError or as an obscure rename failure.
            raise ValueError("row_numbers should not be an empty list.")
        for entry in row_numbers:
            check("entry in the row_numbers argument", entry, [int])
        # Validate before touching the data, so that a LazyFrame is not
        # collected only to have the call rejected afterwards.
        if remove_rows_above and any(
            after - before != 1
            for before, after in zip(row_numbers, row_numbers[1:])
        ):
            raise ValueError(
                "The remove_rows_above argument is applicable "
                "only if the row_numbers argument is an integer, "
                "or the integers in a list are consecutive increasing, "
                "with a difference of 1."
            )

    # One string label per column, taken from the header row(s).
    labels = pl.all().gather(row_numbers).cast(pl.String)
    if row_numbers_is_a_list:
        labels = labels.implode().list.join(separator=separator)
    # Packing the labels in a struct yields a single {old_name: new_name}
    # dictionary once the first value is extracted.
    mapping = df.select(pl.struct(labels))
    if isinstance(mapping, pl.LazyFrame):
        mapping = mapping.collect()
    mapping = mapping.to_series(0)[0]
    df = df.rename(mapping=mapping)

    if remove_rows_above:
        if remove_rows:
            # Start right after the last header row.
            last = row_numbers[-1] if row_numbers_is_a_list else row_numbers
            offset = last + 1
        else:
            # Keep the header row(s); drop only what precedes them.
            offset = row_numbers[0] if row_numbers_is_a_list else row_numbers
        return df.slice(offset=offset)

    if remove_rows:
        # Temporary row-index column. The trailing "_" makes the name
        # strictly longer than every existing column name, so it cannot
        # collide -- the bare concatenation is equal to the column name
        # itself when the frame has a single column.
        idx = "".join(df.columns) + "_"
        df = df.with_row_index(name=idx)
        if row_numbers_is_a_list:
            keep = ~pl.col(idx).is_in(row_numbers)
        else:
            keep = pl.col(idx) != row_numbers
        df = df.filter(keep).drop(idx)
    return df
@pytest.mark.parametrize("df", [df, df.lazy()])
def test_row_to_names_single_list(df):
    """Check the column names when row_numbers is a one-item list."""
    result = df.janitor.row_to_names([2])
    expected = ["3.2346125", "3", "lion", "Basel"]
    assert result.columns[:4] == expected
@pytest.mark.parametrize("df", [df, df.lazy()])
def test_row_to_names_delete_above_list(df):
    """With a list and remove_rows_above, the first header row comes first."""
    out = df.janitor.row_to_names([2, 3], remove_rows_above=True)
    if isinstance(out, pl.LazyFrame):
        out = out.collect()
    # Rows 0 and 1 are dropped; the header rows themselves are kept.
    first_row = tuple(out.to_series(position)[0] for position in range(4))
    assert first_row == (3.234_612_5, 3, "lion", "Basel")
@pytest.mark.parametrize("df", [df, df.lazy()])
def test_row_to_names_delete_above_list_non_consecutive(df):
    """A non-consecutive list combined with remove_rows_above must raise."""
    expected_message = (
        "The remove_rows_above argument is applicable "
        "only if the row_numbers argument is an integer, "
        "or the integers in a list are consecutive increasing, "
        "with a difference of 1."
    )
    with pytest.raises(ValueError, match=expected_message):
        df.janitor.row_to_names([1, 3], remove_rows_above=True)