From ed3ff6772f7e4c6c8bcb5dfa553bad91164f560c Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 17 Jun 2024 20:28:40 +1000 Subject: [PATCH 01/21] faster pivot_longer for non dot value --- janitor/polars/pivot_longer.py | 817 ++++++++++++------ .../functions/test_pivot_longer_polars.py | 141 ++- 2 files changed, 613 insertions(+), 345 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 6e7024cc7..108670419 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -2,14 +2,10 @@ from __future__ import annotations -from collections import defaultdict -from typing import Any, Iterable - from janitor.utils import check, import_message try: import polars as pl - import polars.selectors as cs from polars.type_aliases import ColumnNameOrSelector except ImportError: import_message( @@ -138,24 +134,23 @@ def pivot_longer_spec( "are not present in the source DataFrame." ) - if spec.columns[:2] != [".name", ".value"]: - raise ValueError( - "The first two columns of the spec DataFrame " - "should be '.name' and '.value', " - "with '.name' coming before '.value'." - ) + df_columns = pl.DataFrame({".name": df.columns}) - return _pivot_longer_dot_value( - df=df, - spec=spec, - ) + spec = df_columns.join(spec, on=".name", how="left") + spec = spec.select(pl.exclude(".name")) + if len(spec.columns) == 1: + return _pivot_longer_dot_value_only( + df=df, + outcome=spec, + ) + return def _pivot_longer( df: pl.DataFrame | pl.LazyFrame, index: ColumnNameOrSelector, column_names: ColumnNameOrSelector, - names_to: list | tuple | str, + names_to: list | tuple | str | None, values_to: str, names_sep: str, names_pattern: str, @@ -165,29 +160,6 @@ def _pivot_longer( Unpivots a DataFrame/LazyFrame from wide to long form. """ - ( - df, - index, - column_names, - names_to, - values_to, - names_sep, - names_pattern, - names_transform, - ) = _data_checks_pivot_longer( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - values_to=values_to, - names_sep=names_sep, - names_pattern=names_pattern, - names_transform=names_transform, - ) - - if not column_names: - return df - if all((names_pattern is None, names_sep is None)): return df.melt( id_vars=index, @@ -196,50 +168,180 @@ def _pivot_longer( value_name=values_to, ) - df = df.select(pl.col(index), pl.col(column_names)) if isinstance(names_to, str): names_to = [names_to] + elif isinstance(names_to, (list, tuple)): + uniques = set() + for word in names_to: + if not isinstance(word, str): + raise TypeError( + f"'{word}' in names_to should be a string type; " + f"instead got type {type(word).__name__}" + ) + if (word in uniques) and (word != ".value"): + raise ValueError(f"'{word}' is duplicated in names_to.") + uniques.add(word) + else: + raise TypeError( + "names_to should be a string, list, or tuple; " + f"instead got type {type(names_to).__name__}" + ) + + if names_sep and names_pattern: + raise ValueError( + "Only one of names_pattern or names_sep should be provided." 
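
Before the string-splitting paths, it helps to see the trivial case this refactor keeps: with neither `names_sep` nor `names_pattern`, `pivot_longer` is a plain `melt`. A minimal sketch on toy data (the column names here are invented for illustration):

```python
import polars as pl

df = pl.DataFrame({"id": [1, 2], "ht1": [2.8, 2.1], "ht2": [3.4, 2.9]})

# No names_sep/names_pattern: pivot_longer reduces to a single melt.
out = df.melt(
    id_vars="id",
    value_vars=["ht1", "ht2"],
    variable_name="age",
    value_name="ht",
)
```
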
+ ) + + if names_sep is not None: + check("names_sep", names_sep, [str]) - spec = _pivot_longer_create_spec( + else: + check("names_pattern", names_pattern, [str]) + + check("values_to", values_to, [str]) + + if names_sep and (".value" not in names_to): + return _pivot_longer_names_sep_no_dot_value( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + values_to=values_to, + names_sep=names_sep, + names_transform=names_transform, + ) + if names_pattern and (".value" not in names_to): + return _pivot_longer_names_pattern_no_dot_value( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + values_to=values_to, + names_pattern=names_pattern, + names_transform=names_transform, + ) + if names_sep: + return _pivot_longer_names_sep_dot_value( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + names_sep=names_sep, + names_transform=names_transform, + ) + return _pivot_longer_names_pattern_dot_value( + df=df, + index=index, column_names=column_names, names_to=names_to, - names_sep=names_sep, names_pattern=names_pattern, - values_to=values_to, names_transform=names_transform, ) - return _pivot_longer_dot_value(df=df, spec=spec) - -def _pivot_longer_create_spec( - column_names: Iterable, - names_to: Iterable, - names_sep: str | None, - names_pattern: str | None, +def _pivot_longer_names_sep_no_dot_value( + df: pl.DataFrame | pl.LazyFrame, + index: ColumnNameOrSelector, + column_names: ColumnNameOrSelector, + names_to: list | tuple, values_to: str, + names_sep: str, names_transform: pl.Expr, -) -> pl.DataFrame: +) -> pl.DataFrame | pl.LazyFrame: """ - This is where the spec DataFrame is created, - before the transformation to long form. + flip polars Frame to long form, + if names_sep and no .value in names_to. """ - spec = pl.DataFrame({".name": column_names}) - if names_sep is not None: - expression = ( - pl.col(".name") + variable_name = "".join(df.columns) + # the implode approach is used here + # for efficiency + # it is much faster to extract the relevant strings + # on a smaller set and then explode + # than to melt into the full data and then extract + outcome = ( + df.select(pl.all().implode()) + .melt( + id_vars=index, + value_vars=column_names, + variable_name=variable_name, + value_name=values_to, + ) + .with_columns( + pl.col(variable_name) .str.split(by=names_sep) - .list.to_struct(n_field_strategy="max_width") - .alias("extract") + .list.to_struct(n_field_strategy="max_width"), ) + ) + if isinstance(df, pl.LazyFrame): + extract = outcome.select(variable_name).collect().to_series(0) else: - expression = ( - pl.col(".name") - .str.extract_groups(pattern=names_pattern) - .alias("extract") + extract = outcome.get_column(variable_name) + + len_names_to = len(names_to) + + len_fields = len(extract.struct.fields) + + if len_names_to != len_fields: + raise ValueError( + f"The length of names_to does not match " + "the number of fields extracted. " + f"The length of names_to is {len_names_to} " + "while the number of fields extracted is " + f"{len_fields}." 
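
The implode-first trick that this new function leans on can be shown in isolation. A sketch with toy data, assuming a 2024-era polars where `melt` and `list.to_struct` behave as used in the patch: the string work happens on one row per source column, and the frame is only blown back up at the end.

```python
import polars as pl

df = pl.DataFrame({"id": [1, 2], "ht_1": [2.8, 2.1], "ht_2": [3.4, 2.9]})

out = (
    df.select(pl.all().implode())  # one row; every column becomes a list
    .melt(id_vars="id", variable_name="name", value_name="value")
    .with_columns(
        pl.col("name")
        .str.split(by="_")
        .list.to_struct(n_field_strategy="max_width")
    )
    .unnest("name")          # split fragments become real columns
    .explode("id", "value")  # restore one row per observation
)
```
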
) - spec = spec.with_columns(expression) - len_fields = len(spec.get_column("extract").struct.fields) + + expression = pl.col(variable_name).struct.rename_fields(names=names_to) + outcome = outcome.with_columns(expression) + + if isinstance(df, pl.LazyFrame): + # to ensure the unnested columns are available downstream + # in a LazyFrame, a workaround is to reintroduce + # the variable_name column via with_columns + series = outcome.select(variable_name).collect() + outcome = outcome.with_columns(series) + + outcome = outcome.unnest(variable_name) + if names_transform is not None: + outcome = outcome.with_columns(names_transform) + + columns = [name for name in outcome.columns if name not in names_to] + outcome = outcome.explode(columns=columns) + return outcome + + +def _pivot_longer_names_pattern_no_dot_value( + df: pl.DataFrame | pl.LazyFrame, + index: ColumnNameOrSelector, + column_names: ColumnNameOrSelector, + names_to: list | tuple, + values_to: str, + names_pattern: str, + names_transform: pl.Expr, +) -> pl.DataFrame | pl.LazyFrame: + """ + flip polars Frame to long form, + if names_pattern and no .value in names_to. + """ + variable_name = "".join(df.columns) + outcome = df.select(pl.all().implode()) + outcome = outcome.melt( + id_vars=index, + value_vars=column_names, + variable_name=variable_name, + value_name=values_to, + ) + alias = outcome.columns + alias = "".join(alias) + alias = f"{alias}_" + expression = pl.col(variable_name) + expression = expression.str.extract_groups(pattern=names_pattern) + expression = expression.alias(alias) + outcome = outcome.with_columns(expression) + extract = outcome.select(alias, variable_name) + is_a_lazyframe = isinstance(df, pl.LazyFrame) + if is_a_lazyframe: + extract = extract.collect() + len_fields = len(extract.get_column(alias).struct.fields) len_names_to = len(names_to) if len_names_to != len_fields: @@ -250,220 +352,425 @@ def _pivot_longer_create_spec( "while the number of fields extracted is " f"{len_fields}." ) - if names_pattern is not None: - expression = pl.exclude(".name").is_null().any() - expression = pl.any_horizontal(expression) - null_check = ( - spec.unnest(columns="extract") - .filter(expression) - .get_column(".name") + expression = pl.exclude(variable_name).is_null().any() + expression = pl.any_horizontal(expression) + null_check = ( + extract.unnest(alias).filter(expression).get_column(variable_name) + ) + if null_check.len(): + column_name = null_check.gather(0).item() + raise ValueError( + f"Column label '{column_name}' " + "could not be matched with any of the groups " + "in the provided regex. Kindly provide a regular expression " + "(with the correct groups) that matches all labels in the columns." ) - if null_check.len(): - column_name = null_check.gather(0).item() - raise ValueError( - f"Column label '{column_name}' " - "could not be matched with any of the groups " - "in the provided regex. Kindly provide a regular expression " - "(with the correct groups) that matches all labels in the columns." 
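
For the regex branch, `str.extract_groups` does the heavy lifting: every capture group becomes a struct field, and a label that fails to match yields null fields, which is exactly what the null check hunts for. A small sketch (pattern and labels invented):

```python
import polars as pl

labels = pl.DataFrame({"name": ["x_1_mean", "y_2_sd"]})

# Each capture group becomes a struct field ("1", "2", "3"),
# so unnest turns one string column into three.
out = labels.with_columns(
    pl.col("name").str.extract_groups(r"(.)_(\d)_(mean|sd)")
).unnest("name")
```
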
- ) - if names_to.count(".value") < 2: - expression = pl.col("extract").struct.rename_fields(names=names_to) - spec = spec.with_columns(expression).unnest(columns="extract") - else: - spec = _squash_multiple_dot_value(spec=spec, names_to=names_to) - if ".value" not in names_to: - expression = pl.lit(value=values_to).alias(".value") - spec = spec.with_columns(expression) - spec = spec.select( - pl.col([".name", ".value"]), pl.exclude([".name", ".value"]) - ) + expression = pl.col(alias).struct.rename_fields(names=names_to) + outcome = outcome.with_columns(expression) + + outcome = outcome.select(pl.exclude(variable_name)) + if is_a_lazyframe: + series = outcome.select(alias).collect() + outcome = outcome.with_columns(series) + outcome = outcome.unnest(alias) if names_transform is not None: - spec = spec.with_columns(names_transform) - return spec + outcome = outcome.with_columns(names_transform) + + columns = [name for name in outcome.columns if name not in names_to] + outcome = outcome.explode(columns=columns) + return outcome -def _pivot_longer_dot_value( - df: pl.DataFrame | pl.LazyFrame, spec: pl.DataFrame +def _pivot_longer_names_sep_dot_value( + df: pl.DataFrame | pl.LazyFrame, + index: ColumnNameOrSelector, + column_names: ColumnNameOrSelector, + names_to: list | tuple, + names_sep: str, + names_transform: pl.Expr, ) -> pl.DataFrame | pl.LazyFrame: """ - Reshape DataFrame to long form based on metadata in `spec`. + flip polars Frame to long form, + if names_sep and .value in names_to. """ - index = [column for column in df.columns if column not in spec[".name"]] - not_dot_value = [ - column for column in spec.columns if column not in {".name", ".value"} - ] - idx = "".join(spec.columns) - if not_dot_value: - # assign a number to each group (grouped by not_dot_value) - expression = pl.first(idx).over(not_dot_value).rank("dense").sub(1) - spec = spec.with_row_index(name=idx).with_columns(expression) - else: - # use a cumulative count to properly pair the columns - # grouped by .value - expression = pl.cum_count(".value").over(".value").alias(idx) - spec = spec.with_columns(expression) - mapping = defaultdict(list) - for position, column_name, replacement_name in zip( - spec.get_column(name=idx), - spec.get_column(name=".name"), - spec.get_column(name=".value"), - ): - expression = pl.col(column_name).alias(replacement_name) - mapping[position].append(expression) - - mapping = ( - ( - [ - *index, - *columns_to_select, - ], - pl.lit(position, dtype=pl.UInt32).alias(idx), + + variable_name = "".join(df.columns) + value_name = f"{''.join(df.columns)}_" + outcome = _names_sep_reshape( + df=df, + index=index, + variable_name=variable_name, + column_names=column_names, + names_to=names_to, + value_name=value_name, + names_sep=names_sep, + names_transform=names_transform, + ) + + others = [name for name in names_to if name != ".value"] + if others: + return _pivot_longer_dot_value_others( + df=df, + outcome=outcome, + value_name=value_name, + others=others, ) - for position, columns_to_select in mapping.items() + return _pivot_longer_dot_value_only( + df=df, + outcome=outcome, + variable_name=variable_name, + value_name=value_name, ) - df = [ - df.select(columns_to_select).with_columns(position) - for columns_to_select, position in mapping - ] - # rechunking can be expensive; - # however subsequent operations are faster - # since data is contiguous in memory - df = pl.concat(df, how="diagonal_relaxed", rechunk=True) - expression = pl.cum_count(".value").over(".value").eq(1) - dot_value = 
spec.filter(expression).select(".value") - columns_to_select = [*index, *dot_value.to_series(0)] - if not_dot_value: - if isinstance(df, pl.LazyFrame): - ranges = df.select(idx).collect().get_column(idx) - else: - ranges = df.get_column(idx) - spec = spec.select(pl.struct(not_dot_value)) - _value = spec.columns[0] - expression = pl.cum_count(_value).over(_value).eq(1) - # using a gather approach, instead of a join - # offers more performance - not sure why - # maybe in the join there is another rechunking? - spec = spec.filter(expression).select(pl.col(_value).gather(ranges)) - df = df.with_columns(spec).unnest(_value) - columns_to_select.extend(not_dot_value) - return df.select(columns_to_select) - - -def _squash_multiple_dot_value( - spec: pl.DataFrame, names_to: Iterable -) -> pl.DataFrame: + + +def _pivot_longer_names_pattern_dot_value( + df: pl.DataFrame | pl.LazyFrame, + index: ColumnNameOrSelector, + column_names: ColumnNameOrSelector, + names_to: list | tuple, + names_pattern: str, + names_transform: pl.Expr, +) -> pl.DataFrame | pl.LazyFrame: """ - Combine multiple .values into a single .value column + flip polars Frame to long form, + if names_pattern and .value in names_to. """ - extract = spec.get_column("extract") - fields = extract.struct.fields - dot_value = [ - field for field, label in zip(fields, names_to) if label == ".value" - ] - dot_value = pl.concat_str(dot_value).alias(".value") - not_dot_value = [ - pl.col(field).alias(label) - for field, label in zip(fields, names_to) - if label != ".value" - ] - select_expr = [".name", dot_value] - if not_dot_value: - select_expr.extend(not_dot_value) - - return spec.unnest("extract").select(select_expr) - - -def _data_checks_pivot_longer( - df, - index, - column_names, - names_to, - values_to, - names_sep, - names_pattern, - names_transform, -) -> tuple: + + variable_name = "".join(df.columns) + value_name = f"{''.join(df.columns)}_" + outcome = _names_pattern_reshape( + df=df, + index=index, + variable_name=variable_name, + column_names=column_names, + names_to=names_to, + value_name=value_name, + names_pattern=names_pattern, + names_transform=names_transform, + ) + + others = [name for name in names_to if name != ".value"] + if others: + return _pivot_longer_dot_value_others( + df=df, + outcome=outcome, + value_name=value_name, + others=others, + ) + return _pivot_longer_dot_value_only( + df=df, + outcome=outcome, + value_name=value_name, + ) + + +def _pivot_longer_dot_value_only( + df: pl.DataFrame | pl.LazyFrame, + outcome: pl.DataFrame | pl.LazyFrame, + value_name: str, +) -> pl.DataFrame | pl.LazyFrame: """ - This function majorly does type checks on the passed arguments. 
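
The spec path being removed here paired columns with a running count over `.value`, and the same device survives in the new code, so a tiny sketch of what it computes may help (toy spec):

```python
import polars as pl

spec = pl.DataFrame(
    {
        ".name": ["ht1", "wt1", "ht2", "wt2"],
        ".value": ["ht", "wt", "ht", "wt"],
    }
)

# Running count per .value group: ht1/wt1 share a number, ht2/wt2 share
# the next - columns destined for the same output row get paired.
out = spec.with_columns(
    pl.cum_count(".value").over(".value").alias("pair")
)
```
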
+ Pivot to long form if '.value' only + """ + # for .value reshaping, each sub Frame + # should have the same columns + # the code below creates a DataFrame of unique values + # (here we use cumcount to ensure uniqueness) + alias = "".join(outcome.columns) + expression = pl.cum_count(".value").over(".value").alias(alias) + outcome = outcome.with_columns(expression) + expr1 = pl.col(".value").unique().sort().implode() + expr2 = pl.col(alias).unique().sort().implode() + uniqs = outcome.select(expr1, expr2) + uniqs = uniqs.explode(".value") + uniqs = uniqs.explode(alias) + # uniqs is then joined to `outcome` + # to ensure all groups have the labels in .value + # this may introduce nulls if not all groups + # shared the same labels in .value prior to the join - + # the null check below handles that + outcome = uniqs.join(outcome, on=uniqs.columns, how="left") + # patch to deal with nulls + expression = pl.col(value_name).is_null().any() + null_check = outcome.select(expression) + is_a_lazyframe = isinstance(df, pl.LazyFrame) + if is_a_lazyframe: + null_check = null_check.collect() + null_check = null_check.item() + if null_check: + variable_name = "".join(outcome.columns) + expr1 = pl.lit(None).alias(variable_name) + expr2 = pl.implode(variable_name) + nulls = df.with_columns(expr1).select(expr2) + if is_a_lazyframe: + nulls = nulls.collect() + nulls = nulls.to_series(0) + expression = pl.col(value_name).fill_null(nulls) + outcome = outcome.with_columns(expression) + + index = [ + label + for label in outcome.columns + if label not in {alias, value_name, ".value"} + ] + # due to the implodes, index, if present is repeated + # however, we need index to be unique, + # hence the selection of only the first entry + # from the duplicated(repeated) index values in the list + agg_ = [pl.first(index), pl.col(".value"), pl.col(value_name)] + outcome = outcome.group_by(alias, maintain_order=True).agg(agg_) + # since all groups have the same labels in '.value' + # and order is assured in the group_by operation + # we just grab only the first row + # which will serve as headers of the new columns with values + fields = outcome.select(pl.first(".value")) + if is_a_lazyframe: + fields = fields.collect() + fields = fields.item().to_list() + + outcome = outcome.select(pl.exclude(".value")) + expression = pl.col(value_name).list.to_struct( + n_field_strategy="max_width", fields=fields + ) + outcome = outcome.with_columns(expression) + if is_a_lazyframe: + # to ensure the unnested columns are available downstream + # in a LazyFrame, a workaround is to reintroduce + # the value_name column via with_columns + series = outcome.select(value_name).collect() + outcome = outcome.with_columns(series) + outcome = ( + outcome.unnest(value_name) + .explode([*index, *fields]) + .select(pl.exclude(alias)) + ) + return outcome - This function is executed before proceeding to the computation phase. - Type annotations are not provided because this function is where type - checking happens. +def _pivot_longer_dot_value_others( + df: pl.DataFrame | pl.LazyFrame, + outcome: pl.DataFrame | pl.LazyFrame, + value_name: str, + others: list, +) -> pl.DataFrame | pl.LazyFrame: """ + Pivot to long form if '.value' + and `others`. 
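
The tail end of `_pivot_longer_dot_value_only` converts each aggregated list into real columns by naming the struct fields explicitly; the move is easier to see on its own. A sketch with invented values:

```python
import polars as pl

df = pl.DataFrame({"vals": [[1.4, 5.1], [0.2, 1.8]]})

# Naming the fields makes each list element an addressable column
# once the struct is unnested.
out = df.with_columns(
    pl.col("vals").list.to_struct(
        n_field_strategy="max_width", fields=["Length", "Width"]
    )
).unnest("vals")
```
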
+ """ + # logic breakdown is similar to _pivot_longer_dot_value_only + expr1 = pl.struct(others).unique().sort().implode() + expr2 = pl.col(".value").unique().sort().implode() + uniqs = outcome.select(expr1, expr2) + uniqs = uniqs.explode(others[0]) + uniqs = uniqs.explode(".value") + uniqs = uniqs.unnest(others[0]) + + outcome = uniqs.join(outcome, on=uniqs.columns, how="left") + + expression = pl.col(value_name).is_null().any() + null_check = outcome.select(expression) + is_a_lazyframe = isinstance(df, pl.LazyFrame) + if is_a_lazyframe: + null_check = null_check.collect() + null_check = null_check.item() + if null_check: + variable_name = "".join(outcome.columns) + expr1 = pl.lit(None).alias(variable_name) + expr2 = pl.implode(variable_name) + nulls = df.with_columns(expr1).select(expr2) + if is_a_lazyframe: + nulls = nulls.collect() + nulls = nulls.to_series(0) + expression = pl.col(value_name).fill_null(nulls) + outcome = outcome.with_columns(expression) + + index = [ + label + for label in outcome.columns + if label not in {*others, value_name, ".value"} + ] + agg_ = [pl.first(index), pl.col(".value"), pl.col(value_name)] + outcome = outcome.group_by(others, maintain_order=True).agg(agg_) - def _check_type(arg_name: str, arg_value: Any): - """ - Raise if argument is not a valid type - """ - - def _check_type_single(entry): - if ( - not isinstance(entry, str) - and not cs.is_selector(entry) - and not isinstance(entry, pl.Expr) - ): - raise TypeError( - f"The argument passed to the {arg_name} parameter " - "should be a type that is supported in the polars' " - "select function." - ) + fields = outcome.select(pl.first(".value")) + if is_a_lazyframe: + fields = fields.collect() + fields = fields.item().to_list() - if isinstance(arg_value, (list, tuple)): - for entry in arg_value: - _check_type_single(entry=entry) - else: - _check_type_single(entry=arg_value) - - if (index is None) and (column_names is None): - column_names = df.columns - index = [] - elif (index is not None) and (column_names is not None): - _check_type(arg_name="index", arg_value=index) - index = df.select(index).columns - _check_type(arg_name="column_names", arg_value=column_names) - column_names = df.select(column_names).columns - - elif (index is None) and (column_names is not None): - _check_type(arg_name="column_names", arg_value=column_names) - column_names = df.select(column_names).columns - index = df.select(pl.exclude(column_names)).columns - - elif (index is not None) and (column_names is None): - _check_type(arg_name="index", arg_value=index) - index = df.select(index).columns - column_names = df.select(pl.exclude(index)).columns - - check("names_to", names_to, [list, tuple, str]) - if isinstance(names_to, (list, tuple)): - uniques = set() - for word in names_to: - check(f"'{word}' in names_to", word, [str]) - if (word in uniques) and (word != ".value"): - raise ValueError(f"'{word}' is duplicated in names_to.") - uniques.add(word) + outcome = outcome.select(pl.exclude(".value")) + expression = pl.col(value_name).list.to_struct( + n_field_strategy="max_width", fields=fields + ) - if names_sep and names_pattern: + outcome = outcome.with_columns(expression) + if is_a_lazyframe: + series = outcome.select(value_name).collect() + outcome = outcome.with_columns(series) + outcome = outcome.unnest(value_name).explode([*index, *fields]) + + return outcome + + +def _names_sep_reshape( + df: pl.DataFrame | pl.LazyFrame, + index: ColumnNameOrSelector, + column_names: ColumnNameOrSelector, + names_to: list | tuple, + 
variable_name: str, + value_name: str, + names_sep: str, + names_transform: pl.Expr, +) -> pl.DataFrame | pl.LazyFrame: + # the implode approach is used here + # for efficiency + # it is much faster to extract the relevant strings + # on a smaller set and then explode + # than to melt into the full data and then extract + outcome = ( + df.select(pl.all().implode()) + .melt( + id_vars=index, + value_vars=column_names, + variable_name=variable_name, + value_name=value_name, + ) + .with_columns( + pl.col(variable_name) + .str.split(by=names_sep) + .list.to_struct(n_field_strategy="max_width"), + ) + ) + + if isinstance(df, pl.LazyFrame): + extract = outcome.select(variable_name).collect().to_series(0) + else: + extract = outcome.get_column(variable_name) + + len_names_to = len(names_to) + + len_fields = len(extract.struct.fields) + + if len_names_to != len_fields: raise ValueError( - "Only one of names_pattern or names_sep should be provided." + f"The length of names_to does not match " + "the number of fields extracted. " + f"The length of names_to is {len_names_to} " + "while the number of fields extracted is " + f"{len_fields}." ) - if names_sep is not None: - check("names_sep", names_sep, [str]) + if names_to.count(".value") > 1: + _fields = extract.struct.fields + fields = [ + extract.struct.field(label) + for label, name in zip(_fields, names_to) + if name == ".value" + ] + _value = pl.concat_str(fields).alias(".value") + fields = [ + extract.struct.field(label).alias(name) + for label, name in zip(_fields, names_to) + if name != ".value" + ] + fields.append(_value) + extract = pl.struct(fields).alias(variable_name) + outcome = outcome.with_columns(extract) + else: + expression = pl.col(variable_name).struct.rename_fields(names=names_to) + outcome = outcome.with_columns(expression) + if isinstance(df, pl.LazyFrame): + # to ensure the unnested columns are available downstream + # in a LazyFrame, a workaround is to reintroduce + # the variable_name column via with_columns + series = outcome.select(variable_name).collect() + outcome = outcome.with_columns(series) + outcome = outcome.unnest(variable_name) + if names_transform is not None: + outcome = outcome.with_columns(names_transform) + return outcome - if names_pattern is not None: - check("names_pattern", names_pattern, [str]) - check("values_to", values_to, [str]) +def _names_pattern_reshape( + df: pl.DataFrame | pl.LazyFrame, + index: ColumnNameOrSelector, + column_names: ColumnNameOrSelector, + names_to: list | tuple, + variable_name: str, + value_name: str, + names_pattern: str, + names_transform: pl.Expr, +) -> pl.DataFrame | pl.LazyFrame: + outcome = df.select(pl.all().implode()) + outcome = outcome.melt( + id_vars=index, + value_vars=column_names, + variable_name=variable_name, + value_name=value_name, + ) + alias = outcome.columns + alias = "".join(alias) + alias = f"{alias}_" + outcome = outcome.with_columns( + pl.col(variable_name) + .str.extract_groups(pattern=names_pattern) + .alias(alias) + ) + extract = outcome.select(alias, variable_name) + is_a_lazyframe = isinstance(df, pl.LazyFrame) + if is_a_lazyframe: + extract = extract.collect() + len_fields = len(extract.get_column(alias).struct.fields) + len_names_to = len(names_to) - return ( - df, - index, - column_names, - names_to, - values_to, - names_sep, - names_pattern, - names_transform, + if len_names_to != len_fields: + raise ValueError( + f"The length of names_to does not match " + "the number of fields extracted. 
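
When `names_to` repeats `.value`, the matched fragments are concatenated back into one header, as the `pl.concat_str` branch further down does. Isolated, with invented group columns:

```python
import polars as pl

groups = pl.DataFrame({"1": ["x", "y"], "2": ["_mean", "_sd"]})

# Two '.value' captures are glued back into a single output header.
out = groups.select(pl.concat_str("1", "2").alias(".value"))
# .value -> "x_mean", "y_sd"
```
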
" + f"The length of names_to is {len_names_to} " + "while the number of fields extracted is " + f"{len_fields}." + ) + expression = pl.exclude(variable_name).is_null().any() + expression = pl.any_horizontal(expression) + null_check = ( + extract.unnest(alias).filter(expression).get_column(variable_name) ) + if null_check.len(): + column_name = null_check.gather(0).item() + raise ValueError( + f"Column label '{column_name}' " + "could not be matched with any of the groups " + "in the provided regex. Kindly provide a regular expression " + "(with the correct groups) that matches all labels in the columns." + ) + + if names_to.count(".value") > 1: + extract = extract.get_column(alias) + _fields = extract.struct.fields + fields = [ + extract.struct.field(label) + for label, name in zip(_fields, names_to) + if name == ".value" + ] + _value = pl.concat_str(fields).alias(".value") + fields = [ + extract.struct.field(label).alias(name) + for label, name in zip(_fields, names_to) + if name != ".value" + ] + fields.append(_value) + extract = pl.struct(fields).alias(alias) + outcome = outcome.with_columns(extract) + else: + expression = pl.col(alias).struct.rename_fields(names=names_to) + outcome = outcome.with_columns(expression) + + outcome = outcome.select(pl.exclude(variable_name)) + if is_a_lazyframe: + series = outcome.select(alias).collect() + outcome = outcome.with_columns(series) + outcome = outcome.unnest(alias) + if names_transform is not None: + outcome = outcome.with_columns(names_transform) + return outcome diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py index 46bc61c12..4de18f0d8 100644 --- a/tests/polars/functions/test_pivot_longer_polars.py +++ b/tests/polars/functions/test_pivot_longer_polars.py @@ -19,25 +19,9 @@ def df_checks(): ) -def test_type_index(df_checks): - """Raise TypeError if wrong type is provided for the index.""" - msg = "The argument passed to the index parameter " - msg += "should be a type that is supported in the.+" - with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(index=2007, names_sep="_") - - -def test_type_column_names(df_checks): - """Raise TypeError if wrong type is provided for column_names.""" - msg = "The argument passed to the column_names parameter " - msg += "should be a type that is supported in the.+" - with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(column_names=2007, names_sep="_") - - def test_type_names_to(df_checks): """Raise TypeError if wrong type is provided for names_to.""" - msg = "names_to should be one of .+" + msg = "names_to should be a string, list, or tuple.+" with pytest.raises(TypeError, match=msg): df_checks.janitor.pivot_longer(names_to=2007, names_sep="_") @@ -96,38 +80,6 @@ def test_values_to_wrong_type(df_checks): df_checks.janitor.pivot_longer(values_to={"salvo"}, names_sep="_") -def test_pivot_index_only(df_checks): - """Test output if only index is passed.""" - result = df_checks.janitor.pivot_longer( - index=["famid", "birth"], - names_to="dim", - values_to="num", - ) - - actual = df_checks.melt( - id_vars=["famid", "birth"], variable_name="dim", value_name="num" - ) - - assert_frame_equal(result, actual, check_column_order=False) - - -def test_pivot_column_only(df_checks): - """Test output if only column_names is passed.""" - result = df_checks.janitor.pivot_longer( - column_names=["ht1", "ht2"], - names_to="dim", - values_to="num", - ) - - actual = df_checks.melt( - id_vars=["famid", "birth"], - 
variable_name="dim", - value_name="num", - ) - - assert_frame_equal(result, actual, check_column_order=False) - - def test_names_to_names_pattern_len(df_checks): """ " Raise ValueError @@ -167,12 +119,16 @@ def test_names_pat_str(df_checks): Test output when names_pattern is a string, and .value is present. """ - result = df_checks.janitor.pivot_longer( - column_names=cs.starts_with("ht"), - names_to=(".value", "age"), - names_pattern="(.+)(.)", - names_transform=pl.col("age").cast(pl.Int64), - ).sort(by=pl.all()) + result = ( + df_checks.janitor.pivot_longer( + index=["famid", "birth"], + names_to=(".value", "age"), + names_pattern="(.+)(.)", + names_transform=pl.col("age").cast(pl.Int64), + ) + .select("famid", "birth", "age", "ht") + .sort(by=pl.all()) + ) actual = [ {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, @@ -196,20 +152,7 @@ def test_names_pat_str(df_checks): ] actual = pl.DataFrame(actual).sort(by=pl.all()) - assert_frame_equal( - result, actual, check_dtype=False, check_column_order=False - ) - - -def test_no_column_names(df_checks): - """ - Test output if all the columns - are assigned to the index parameter. - """ - assert_frame_equal( - df_checks.janitor.pivot_longer(index=pl.all()), - df_checks, - ) + assert_frame_equal(result, actual) @pytest.fixture @@ -316,23 +259,37 @@ def test_df(): def test_names_pattern_dot_value(test_df): """Test output for names_pattern and .value.""" - result = test_df.janitor.pivot_longer( - column_names=pl.all(), - names_to=["set", ".value"], - names_pattern="(.+)_(.+)", - ).sort(by=["loc", "lat", "long"]) - assert_frame_equal(result, actual, check_column_order=False) + result = ( + test_df.janitor.pivot_longer( + column_names=cs.all(), + names_to=["set", ".value"], + names_pattern="(.+)_(.+)", + ) + .sort(by=["loc", "lat", "long"]) + .with_columns( + pl.col("lat").cast(pl.Float64), pl.col("long").cast(pl.Float64) + ) + .select("set", "loc", "lat", "long") + ) + assert_frame_equal(result, actual) def test_names_sep_dot_value(test_df): """Test output for names_pattern and .value.""" - result = test_df.janitor.pivot_longer( - column_names=pl.all(), - names_to=["set", ".value"], - names_sep="_", - ).sort(by=["loc", "lat", "long"]) - assert_frame_equal(result, actual, check_column_order=False) + result = ( + test_df.janitor.pivot_longer( + column_names=cs.all(), + names_to=["set", ".value"], + names_sep="_", + ) + .sort(by=["loc", "lat", "long"]) + .with_columns( + pl.col("lat").cast(pl.Float64), pl.col("long").cast(pl.Float64) + ) + .select("set", "loc", "lat", "long") + ) + assert_frame_equal(result, actual) @pytest.fixture @@ -394,7 +351,7 @@ def test_not_dot_value_sep2(not_dot_value): "country", variable_name="event", value_name="score" ) - assert_frame_equal(result, actual, check_column_order=False) + assert_frame_equal(result, actual) def test_not_dot_value_pattern(not_dot_value): @@ -451,6 +408,9 @@ def test_multiple_dot_value(): names_pattern=r"(x|y)_([0-9])(_mean|_sd)", names_transform=pl.col("time").cast(pl.Int64), ) + .with_columns( + pl.col("x_mean").cast(pl.Int64), pl.col("y_mean").cast(pl.Int64) + ) .select("unit", "time", "x_mean", "x_sd", "y_mean", "y_sd") .sort(by=pl.all()) ) @@ -466,7 +426,7 @@ def test_multiple_dot_value(): actual = pl.DataFrame(actual).sort(by=pl.all()) - assert_frame_equal(result, actual, check_column_order=False) + assert_frame_equal(result, actual) @pytest.fixture @@ -512,7 +472,7 @@ def test_names_pattern_single_column(single_val): "id", names_to=".value", names_pattern="(.)." 
) - assert_frame_equal(result, actual3, check_column_order=False) + assert_frame_equal(result, actual3) def test_names_pattern_single_column_not_dot_value(single_val): @@ -521,12 +481,11 @@ def test_names_pattern_single_column_not_dot_value(single_val): """ result = single_val.janitor.pivot_longer( index="id", column_names="x1", names_to="yA", names_pattern="(.+)" - ) + ).select("id", "yA", "value") assert_frame_equal( result, single_val.melt(id_vars="id", value_vars="x1", variable_name="yA"), - check_column_order=False, ) @@ -534,14 +493,15 @@ def test_names_pattern_single_column_not_dot_value1(single_val): """ Test output if names_to is not '.value'. """ - result = single_val.select("x1").janitor.pivot_longer( - names_to="yA", names_pattern="(.+)" + result = ( + single_val.select("x1") + .janitor.pivot_longer(names_to="yA", names_pattern="(.+)") + .select("yA", "value") ) assert_frame_equal( result, single_val.select("x1").melt(variable_name="yA"), - check_column_order=False, ) @@ -579,6 +539,7 @@ def test_names_pattern_nulls_in_data(df_null): names_to=[".value", "child"], names_pattern=r"(.+)_(.+)", ) + .with_columns(pl.col("gender").cast(pl.Float64)) .select("family", "child", "dob", "gender") .sort(by=pl.all()) ) @@ -598,4 +559,4 @@ def test_names_pattern_nulls_in_data(df_null): actual = pl.DataFrame(actual).sort(by=pl.all()) - assert_frame_equal(result, actual, check_column_order=False) + assert_frame_equal(result, actual) From 1568143c1ef73fbab043ee8d6e9440d2d803c34e Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 18 Jun 2024 12:53:55 +1000 Subject: [PATCH 02/21] fix docs and tests --- janitor/polars/dataframe.py | 4 +- janitor/polars/pivot_longer.py | 818 ++++++------------ .../functions/test_pivot_longer_polars.py | 14 +- 3 files changed, 272 insertions(+), 564 deletions(-) diff --git a/janitor/polars/dataframe.py b/janitor/polars/dataframe.py index 31a55e468..e3d437d8b 100644 --- a/janitor/polars/dataframe.py +++ b/janitor/polars/dataframe.py @@ -206,10 +206,10 @@ def pivot_longer( │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ f64 ┆ f64 │ ╞═══════════╪═══════╪════════╪═══════╡ - │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ - │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ + │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ └───────────┴───────┴────────┴───────┘ Split the column labels based on regex: diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 108670419..aaf66ff8f 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -38,7 +38,7 @@ def pivot_longer_spec( Examples: >>> import pandas as pd - >>> import janitor.polars + >>> from janitor.polars import pivot_longer_spec >>> df = pl.DataFrame( ... { ... 
"Sepal.Length": [5.1, 5.9], @@ -77,16 +77,16 @@ def pivot_longer_spec( └──────────────┴────────┴───────┘ >>> df.pipe(pivot_longer_spec,spec=spec) shape: (4, 4) - ┌───────────┬────────┬───────┬───────┐ - │ Species ┆ Length ┆ Width ┆ part │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ str │ - ╞═══════════╪════════╪═══════╪═══════╡ - │ setosa ┆ 5.1 ┆ 3.5 ┆ Sepal │ - │ virginica ┆ 5.9 ┆ 3.0 ┆ Sepal │ - │ setosa ┆ 1.4 ┆ 0.2 ┆ Petal │ - │ virginica ┆ 5.1 ┆ 1.8 ┆ Petal │ - └───────────┴────────┴───────┴───────┘ + ┌───────────┬───────┬────────┬───────┐ + │ Species ┆ part ┆ Length ┆ Width │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 ┆ f64 │ + ╞═══════════╪═══════╪════════╪═══════╡ + │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ + │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ + │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ + └───────────┴───────┴────────┴───────┘ Args: df: The source DataFrame to unpivot. @@ -133,17 +133,30 @@ def pivot_longer_spec( "Kindly ensure the spec DataFrame's columns " "are not present in the source DataFrame." ) - - df_columns = pl.DataFrame({".name": df.columns}) - - spec = df_columns.join(spec, on=".name", how="left") - spec = spec.select(pl.exclude(".name")) - if len(spec.columns) == 1: - return _pivot_longer_dot_value_only( - df=df, - outcome=spec, - ) - return + index = [ + label for label in df.columns if label not in spec.get_column(".name") + ] + others = [ + label for label in spec.columns if label not in {".name", ".value"} + ] + variable_name = "".join(df.columns + spec.columns) + variable_name = f"{variable_name}_" + if others: + dot_value_only = False + expression = pl.struct(others).alias(variable_name) + spec = spec.select(".name", ".value", expression) + else: + dot_value_only = True + expression = pl.cum_count(".value").over(".value").alias(variable_name) + spec = spec.with_columns(expression) + return _pivot_longer_dot_value( + df=df, + index=index, + spec=spec, + variable_name=variable_name, + dot_value_only=dot_value_only, + names_transform=None, + ) def _pivot_longer( @@ -168,119 +181,92 @@ def _pivot_longer( value_name=values_to, ) - if isinstance(names_to, str): - names_to = [names_to] - elif isinstance(names_to, (list, tuple)): - uniques = set() - for word in names_to: - if not isinstance(word, str): - raise TypeError( - f"'{word}' in names_to should be a string type; " - f"instead got type {type(word).__name__}" - ) - if (word in uniques) and (word != ".value"): - raise ValueError(f"'{word}' is duplicated in names_to.") - uniques.add(word) - else: - raise TypeError( - "names_to should be a string, list, or tuple; " - f"instead got type {type(names_to).__name__}" - ) - - if names_sep and names_pattern: - raise ValueError( - "Only one of names_pattern or names_sep should be provided." 
- ) - - if names_sep is not None: - check("names_sep", names_sep, [str]) - - else: - check("names_pattern", names_pattern, [str]) + ( + df, + index, + column_names, + names_to, + values_to, + names_sep, + names_pattern, + ) = _data_checks_pivot_longer( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + values_to=values_to, + names_sep=names_sep, + names_pattern=names_pattern, + ) - check("values_to", values_to, [str]) + variable_name = "".join(df.columns) + variable_name = f"{variable_name}_" + spec = _pivot_longer_create_spec( + column_names=column_names, + names_to=names_to, + names_sep=names_sep, + names_pattern=names_pattern, + variable_name=variable_name, + ) - if names_sep and (".value" not in names_to): - return _pivot_longer_names_sep_no_dot_value( + if ".value" not in names_to: + return _pivot_longer_no_dot_value( df=df, index=index, + spec=spec, column_names=column_names, names_to=names_to, values_to=values_to, - names_sep=names_sep, - names_transform=names_transform, - ) - if names_pattern and (".value" not in names_to): - return _pivot_longer_names_pattern_no_dot_value( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - values_to=values_to, - names_pattern=names_pattern, - names_transform=names_transform, - ) - if names_sep: - return _pivot_longer_names_sep_dot_value( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - names_sep=names_sep, + variable_name=variable_name, names_transform=names_transform, ) - return _pivot_longer_names_pattern_dot_value( + + if {".name", ".value"}.symmetric_difference(spec.columns): + dot_value_only = False + else: + dot_value_only = True + expression = pl.cum_count(".value").over(".value").alias(variable_name) + spec = spec.with_columns(expression) + + return _pivot_longer_dot_value( df=df, index=index, - column_names=column_names, - names_to=names_to, - names_pattern=names_pattern, + spec=spec, + variable_name=variable_name, + dot_value_only=dot_value_only, names_transform=names_transform, ) -def _pivot_longer_names_sep_no_dot_value( - df: pl.DataFrame | pl.LazyFrame, - index: ColumnNameOrSelector, - column_names: ColumnNameOrSelector, - names_to: list | tuple, - values_to: str, - names_sep: str, - names_transform: pl.Expr, -) -> pl.DataFrame | pl.LazyFrame: +def _pivot_longer_create_spec( + column_names: list, + names_to: list, + names_sep: str | None, + names_pattern: str | None, + variable_name: str, +) -> pl.DataFrame: """ - flip polars Frame to long form, - if names_sep and no .value in names_to. + This is where the spec DataFrame is created, + before the transformation to long form. 
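
Concretely, the spec being built here is a one-row-per-source-column map; hand-writing one makes the `.name`/`.value` contract obvious (values illustrative):

```python
import polars as pl

spec = pl.DataFrame(
    {
        ".name": ["ht1", "ht2"],  # source column labels
        ".value": ["ht", "ht"],   # output column each one feeds
        "age": [1, 2],            # extra variable parsed from the label
    }
)
```
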
""" - variable_name = "".join(df.columns) - # the implode approach is used here - # for efficiency - # it is much faster to extract the relevant strings - # on a smaller set and then explode - # than to melt into the full data and then extract - outcome = ( - df.select(pl.all().implode()) - .melt( - id_vars=index, - value_vars=column_names, - variable_name=variable_name, - value_name=values_to, - ) - .with_columns( - pl.col(variable_name) + spec = pl.DataFrame({".name": column_names}) + if names_sep is not None: + expression = ( + pl.col(".name") .str.split(by=names_sep) - .list.to_struct(n_field_strategy="max_width"), + .list.to_struct(n_field_strategy="max_width") + .alias(variable_name) ) - ) - if isinstance(df, pl.LazyFrame): - extract = outcome.select(variable_name).collect().to_series(0) else: - extract = outcome.get_column(variable_name) - + expression = ( + pl.col(".name") + .str.extract_groups(pattern=names_pattern) + .alias(variable_name) + ) + spec = spec.with_columns(expression) + len_fields = len(spec.get_column(variable_name).struct.fields) len_names_to = len(names_to) - len_fields = len(extract.struct.fields) - if len_names_to != len_fields: raise ValueError( f"The length of names_to does not match " @@ -289,488 +275,220 @@ def _pivot_longer_names_sep_no_dot_value( "while the number of fields extracted is " f"{len_fields}." ) - - expression = pl.col(variable_name).struct.rename_fields(names=names_to) - outcome = outcome.with_columns(expression) - - if isinstance(df, pl.LazyFrame): - # to ensure the unnested columns are available downstream - # in a LazyFrame, a workaround is to reintroduce - # the variable_name column via with_columns - series = outcome.select(variable_name).collect() - outcome = outcome.with_columns(series) - - outcome = outcome.unnest(variable_name) - if names_transform is not None: - outcome = outcome.with_columns(names_transform) - - columns = [name for name in outcome.columns if name not in names_to] - outcome = outcome.explode(columns=columns) - return outcome + if names_pattern is not None: + expression = pl.exclude(".name").is_null().any() + expression = pl.any_horizontal(expression) + null_check = ( + spec.unnest(columns=variable_name) + .filter(expression) + .get_column(".name") + ) + if null_check.len(): + column_name = null_check.gather(0).item() + raise ValueError( + f"Column label '{column_name}' " + "could not be matched with any of the groups " + "in the provided regex. Kindly provide a regular expression " + "(with the correct groups) that matches all labels in the columns." 
+ ) + + if ".value" not in names_to: + spec = spec.get_column(variable_name) + spec = spec.struct.rename_fields(names=names_to) + return spec + if names_to.count(".value") == 1: + spec = spec.with_columns( + pl.col(variable_name).struct.rename_fields(names=names_to) + ) + if ".value" not in names_to: + return spec.get_column(variable_name) + not_dot_value = [name for name in names_to if name != ".value"] + spec = spec.unnest(variable_name) + if not_dot_value: + return spec.select( + ".name", + ".value", + pl.struct(not_dot_value).alias(variable_name), + ) + return spec.select(".name", ".value") + _spec = spec.get_column(variable_name) + _spec = _spec.struct.unnest() + fields = _spec.columns + + if len(set(names_to)) == 1: + expression = pl.concat_str(fields).alias(".value") + dot_value = _spec.select(expression) + dot_value = dot_value.to_series(0) + return spec.select(".name", dot_value) + dot_value = [ + field for field, label in zip(fields, names_to) if label == ".value" + ] + dot_value = pl.concat_str(dot_value).alias(".value") + not_dot_value = [ + pl.col(field).alias(label) + for field, label in zip(fields, names_to) + if label != ".value" + ] + not_dot_value = pl.struct(not_dot_value).alias(variable_name) + return _spec.select(spec.get_column(".name"), not_dot_value, dot_value) -def _pivot_longer_names_pattern_no_dot_value( +def _pivot_longer_no_dot_value( df: pl.DataFrame | pl.LazyFrame, + spec: pl.DataFrame, index: ColumnNameOrSelector, column_names: ColumnNameOrSelector, names_to: list | tuple, values_to: str, - names_pattern: str, + variable_name: str, names_transform: pl.Expr, ) -> pl.DataFrame | pl.LazyFrame: """ flip polars Frame to long form, - if names_pattern and no .value in names_to. + if no .value in names_to. """ - variable_name = "".join(df.columns) - outcome = df.select(pl.all().implode()) - outcome = outcome.melt( - id_vars=index, - value_vars=column_names, - variable_name=variable_name, - value_name=values_to, - ) - alias = outcome.columns - alias = "".join(alias) - alias = f"{alias}_" - expression = pl.col(variable_name) - expression = expression.str.extract_groups(pattern=names_pattern) - expression = expression.alias(alias) - outcome = outcome.with_columns(expression) - extract = outcome.select(alias, variable_name) - is_a_lazyframe = isinstance(df, pl.LazyFrame) - if is_a_lazyframe: - extract = extract.collect() - len_fields = len(extract.get_column(alias).struct.fields) - len_names_to = len(names_to) - - if len_names_to != len_fields: - raise ValueError( - f"The length of names_to does not match " - "the number of fields extracted. " - f"The length of names_to is {len_names_to} " - "while the number of fields extracted is " - f"{len_fields}." + # the implode/explode approach is used here + # for efficiency + # do the operation on a smaller size + # and then blow it up after + # it is usually much faster + # than running on the actual data + outcome = ( + df.select(pl.all().implode()) + .melt( + id_vars=index, + value_vars=column_names, + variable_name=variable_name, + value_name=values_to, ) - expression = pl.exclude(variable_name).is_null().any() - expression = pl.any_horizontal(expression) - null_check = ( - extract.unnest(alias).filter(expression).get_column(variable_name) + .with_columns(spec) ) - if null_check.len(): - column_name = null_check.gather(0).item() - raise ValueError( - f"Column label '{column_name}' " - "could not be matched with any of the groups " - "in the provided regex. 
Kindly provide a regular expression " - "(with the correct groups) that matches all labels in the columns." - ) - - expression = pl.col(alias).struct.rename_fields(names=names_to) - outcome = outcome.with_columns(expression) - outcome = outcome.select(pl.exclude(variable_name)) - if is_a_lazyframe: - series = outcome.select(alias).collect() - outcome = outcome.with_columns(series) - outcome = outcome.unnest(alias) + outcome = outcome.unnest(variable_name) if names_transform is not None: outcome = outcome.with_columns(names_transform) - columns = [name for name in outcome.columns if name not in names_to] outcome = outcome.explode(columns=columns) return outcome -def _pivot_longer_names_sep_dot_value( +def _pivot_longer_dot_value( df: pl.DataFrame | pl.LazyFrame, + spec: pl.DataFrame, index: ColumnNameOrSelector, - column_names: ColumnNameOrSelector, - names_to: list | tuple, - names_sep: str, + variable_name: str, + dot_value_only: bool, names_transform: pl.Expr, ) -> pl.DataFrame | pl.LazyFrame: """ flip polars Frame to long form, if names_sep and .value in names_to. """ - - variable_name = "".join(df.columns) - value_name = f"{''.join(df.columns)}_" - outcome = _names_sep_reshape( - df=df, - index=index, - variable_name=variable_name, - column_names=column_names, - names_to=names_to, - value_name=value_name, - names_sep=names_sep, - names_transform=names_transform, - ) - - others = [name for name in names_to if name != ".value"] - if others: - return _pivot_longer_dot_value_others( - df=df, - outcome=outcome, - value_name=value_name, - others=others, - ) - return _pivot_longer_dot_value_only( - df=df, - outcome=outcome, - variable_name=variable_name, - value_name=value_name, - ) - - -def _pivot_longer_names_pattern_dot_value( - df: pl.DataFrame | pl.LazyFrame, - index: ColumnNameOrSelector, - column_names: ColumnNameOrSelector, - names_to: list | tuple, - names_pattern: str, - names_transform: pl.Expr, -) -> pl.DataFrame | pl.LazyFrame: - """ - flip polars Frame to long form, - if names_pattern and .value in names_to. 
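
In `_pivot_longer_no_dot_value`, the spec arrives as a precomputed struct Series that simply overwrites the melted variable column. A self-contained sketch of that attachment, with an invented two-field spec, assuming `melt` preserves the column order used to build it:

```python
import polars as pl

df = pl.DataFrame({"id": [1], "x_1": [5.0], "x_2": [6.0]})

# Precomputed struct series standing in for the melted labels.
spec = pl.Series(
    "variable", [{"var": "x", "num": "1"}, {"var": "x", "num": "2"}]
)

out = (
    df.select(pl.all().implode())
    .melt(id_vars="id", variable_name="variable", value_name="value")
    .with_columns(spec)      # replace raw labels with the parsed struct
    .unnest("variable")
    .explode("id", "value")
)
```
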
- """ - - variable_name = "".join(df.columns) - value_name = f"{''.join(df.columns)}_" - outcome = _names_pattern_reshape( - df=df, - index=index, - variable_name=variable_name, - column_names=column_names, - names_to=names_to, - value_name=value_name, - names_pattern=names_pattern, - names_transform=names_transform, - ) - - others = [name for name in names_to if name != ".value"] - if others: - return _pivot_longer_dot_value_others( - df=df, - outcome=outcome, - value_name=value_name, - others=others, - ) - return _pivot_longer_dot_value_only( - df=df, - outcome=outcome, - value_name=value_name, - ) - - -def _pivot_longer_dot_value_only( - df: pl.DataFrame | pl.LazyFrame, - outcome: pl.DataFrame | pl.LazyFrame, - value_name: str, -) -> pl.DataFrame | pl.LazyFrame: - """ - Pivot to long form if '.value' only - """ - # for .value reshaping, each sub Frame - # should have the same columns - # the code below creates a DataFrame of unique values - # (here we use cumcount to ensure uniqueness) - alias = "".join(outcome.columns) - expression = pl.cum_count(".value").over(".value").alias(alias) - outcome = outcome.with_columns(expression) - expr1 = pl.col(".value").unique().sort().implode() - expr2 = pl.col(alias).unique().sort().implode() - uniqs = outcome.select(expr1, expr2) - uniqs = uniqs.explode(".value") - uniqs = uniqs.explode(alias) - # uniqs is then joined to `outcome` - # to ensure all groups have the labels in .value - # this may introduce nulls if not all groups - # shared the same labels in .value prior to the join - - # the null check below handles that - outcome = uniqs.join(outcome, on=uniqs.columns, how="left") - # patch to deal with nulls - expression = pl.col(value_name).is_null().any() - null_check = outcome.select(expression) - is_a_lazyframe = isinstance(df, pl.LazyFrame) - if is_a_lazyframe: - null_check = null_check.collect() - null_check = null_check.item() - if null_check: - variable_name = "".join(outcome.columns) - expr1 = pl.lit(None).alias(variable_name) - expr2 = pl.implode(variable_name) - nulls = df.with_columns(expr1).select(expr2) - if is_a_lazyframe: - nulls = nulls.collect() - nulls = nulls.to_series(0) - expression = pl.col(value_name).fill_null(nulls) - outcome = outcome.with_columns(expression) - - index = [ - label - for label in outcome.columns - if label not in {alias, value_name, ".value"} - ] - # due to the implodes, index, if present is repeated - # however, we need index to be unique, - # hence the selection of only the first entry - # from the duplicated(repeated) index values in the list - agg_ = [pl.first(index), pl.col(".value"), pl.col(value_name)] - outcome = outcome.group_by(alias, maintain_order=True).agg(agg_) - # since all groups have the same labels in '.value' - # and order is assured in the group_by operation - # we just grab only the first row - # which will serve as headers of the new columns with values - fields = outcome.select(pl.first(".value")) - if is_a_lazyframe: - fields = fields.collect() - fields = fields.item().to_list() - - outcome = outcome.select(pl.exclude(".value")) - expression = pl.col(value_name).list.to_struct( - n_field_strategy="max_width", fields=fields - ) - outcome = outcome.with_columns(expression) - if is_a_lazyframe: - # to ensure the unnested columns are available downstream - # in a LazyFrame, a workaround is to reintroduce - # the value_name column via with_columns - series = outcome.select(value_name).collect() - outcome = outcome.with_columns(series) + spec = spec.group_by(variable_name) + spec = 
spec.agg(pl.all()) + expressions = [] + for names, fields in zip( + spec.get_column(".name").to_list(), + spec.get_column(".value").to_list(), + ): + expression = pl.struct(names).struct.rename_fields(names=fields) + expressions.append(expression) + expressions = [*index, *expressions] + spec = spec.get_column(variable_name) outcome = ( - outcome.unnest(value_name) - .explode([*index, *fields]) - .select(pl.exclude(alias)) + df.select(expressions) + .select(pl.all().implode()) + .melt(id_vars=index, variable_name=variable_name, value_name=".value") + .with_columns(spec) ) - return outcome - -def _pivot_longer_dot_value_others( - df: pl.DataFrame | pl.LazyFrame, - outcome: pl.DataFrame | pl.LazyFrame, - value_name: str, - others: list, -) -> pl.DataFrame | pl.LazyFrame: - """ - Pivot to long form if '.value' - and `others`. - """ - # logic breakdown is similar to _pivot_longer_dot_value_only - expr1 = pl.struct(others).unique().sort().implode() - expr2 = pl.col(".value").unique().sort().implode() - uniqs = outcome.select(expr1, expr2) - uniqs = uniqs.explode(others[0]) - uniqs = uniqs.explode(".value") - uniqs = uniqs.unnest(others[0]) - - outcome = uniqs.join(outcome, on=uniqs.columns, how="left") - - expression = pl.col(value_name).is_null().any() - null_check = outcome.select(expression) - is_a_lazyframe = isinstance(df, pl.LazyFrame) - if is_a_lazyframe: - null_check = null_check.collect() - null_check = null_check.item() - if null_check: - variable_name = "".join(outcome.columns) - expr1 = pl.lit(None).alias(variable_name) - expr2 = pl.implode(variable_name) - nulls = df.with_columns(expr1).select(expr2) - if is_a_lazyframe: - nulls = nulls.collect() - nulls = nulls.to_series(0) - expression = pl.col(value_name).fill_null(nulls) - outcome = outcome.with_columns(expression) - - index = [ - label - for label in outcome.columns - if label not in {*others, value_name, ".value"} + if dot_value_only: + columns = [ + label for label in outcome.columns if label != variable_name + ] + outcome = outcome.explode(columns).unnest(".value") + outcome = outcome.select(pl.exclude(variable_name)) + return outcome + outcome = outcome.unnest(variable_name) + if names_transform is not None: + outcome = outcome.with_columns(names_transform) + columns = [ + label for label in outcome.columns if label not in spec.struct.fields ] - agg_ = [pl.first(index), pl.col(".value"), pl.col(value_name)] - outcome = outcome.group_by(others, maintain_order=True).agg(agg_) - - fields = outcome.select(pl.first(".value")) - if is_a_lazyframe: - fields = fields.collect() - fields = fields.item().to_list() - - outcome = outcome.select(pl.exclude(".value")) - expression = pl.col(value_name).list.to_struct( - n_field_strategy="max_width", fields=fields - ) - - outcome = outcome.with_columns(expression) - if is_a_lazyframe: - series = outcome.select(value_name).collect() - outcome = outcome.with_columns(series) - outcome = outcome.unnest(value_name).explode([*index, *fields]) + outcome = outcome.explode(columns) + outcome = outcome.unnest(".value") return outcome -def _names_sep_reshape( - df: pl.DataFrame | pl.LazyFrame, - index: ColumnNameOrSelector, - column_names: ColumnNameOrSelector, - names_to: list | tuple, - variable_name: str, - value_name: str, - names_sep: str, - names_transform: pl.Expr, -) -> pl.DataFrame | pl.LazyFrame: - # the implode approach is used here - # for efficiency - # it is much faster to extract the relevant strings - # on a smaller set and then explode - # than to melt into the full data and then 
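
The zip-and-rename loop here gives every `.value` group identically named struct fields, so the groups become union-compatible and can be melted into a single column. The core move, isolated on toy columns:

```python
import polars as pl

df = pl.DataFrame({"x_1": [1.0], "y_1": [2.0], "x_2": [3.0], "y_2": [4.0]})

# Same field names in every group -> the structs share one dtype
# and can be melted into one '.value' column, then unnested.
grp1 = pl.struct("x_1", "y_1").struct.rename_fields(["x", "y"]).alias("1")
grp2 = pl.struct("x_2", "y_2").struct.rename_fields(["x", "y"]).alias("2")
out = df.select(grp1, grp2).melt(value_name=".value").unnest(".value")
```
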
extract - outcome = ( - df.select(pl.all().implode()) - .melt( - id_vars=index, - value_vars=column_names, - variable_name=variable_name, - value_name=value_name, - ) - .with_columns( - pl.col(variable_name) - .str.split(by=names_sep) - .list.to_struct(n_field_strategy="max_width"), - ) - ) - - if isinstance(df, pl.LazyFrame): - extract = outcome.select(variable_name).collect().to_series(0) - else: - extract = outcome.get_column(variable_name) +def _data_checks_pivot_longer( + df, + index, + column_names, + names_to, + values_to, + names_sep, + names_pattern, +) -> tuple: + """ + This function majorly does type checks on the passed arguments. - len_names_to = len(names_to) + This function is executed before proceeding to the computation phase. - len_fields = len(extract.struct.fields) + Type annotations are not provided because this function is where type + checking happens. + """ + if isinstance(names_to, str): + names_to = [names_to] + elif isinstance(names_to, (list, tuple)): + uniques = set() + for word in names_to: + if not isinstance(word, str): + raise TypeError( + f"'{word}' in names_to should be a string type; " + f"instead got type {type(word).__name__}" + ) + if (word in uniques) and (word != ".value"): + raise ValueError(f"'{word}' is duplicated in names_to.") + uniques.add(word) + else: + raise TypeError( + "names_to should be a string, list, or tuple; " + f"instead got type {type(names_to).__name__}" + ) - if len_names_to != len_fields: + if names_sep and names_pattern: raise ValueError( - f"The length of names_to does not match " - "the number of fields extracted. " - f"The length of names_to is {len_names_to} " - "while the number of fields extracted is " - f"{len_fields}." + "Only one of names_pattern or names_sep should be provided." 
) - if names_to.count(".value") > 1: - _fields = extract.struct.fields - fields = [ - extract.struct.field(label) - for label, name in zip(_fields, names_to) - if name == ".value" - ] - _value = pl.concat_str(fields).alias(".value") - fields = [ - extract.struct.field(label).alias(name) - for label, name in zip(_fields, names_to) - if name != ".value" - ] - fields.append(_value) - extract = pl.struct(fields).alias(variable_name) - outcome = outcome.with_columns(extract) - else: - expression = pl.col(variable_name).struct.rename_fields(names=names_to) - outcome = outcome.with_columns(expression) - if isinstance(df, pl.LazyFrame): - # to ensure the unnested columns are available downstream - # in a LazyFrame, a workaround is to reintroduce - # the variable_name column via with_columns - series = outcome.select(variable_name).collect() - outcome = outcome.with_columns(series) - outcome = outcome.unnest(variable_name) - if names_transform is not None: - outcome = outcome.with_columns(names_transform) - return outcome - + if names_sep is not None: + check("names_sep", names_sep, [str]) -def _names_pattern_reshape( - df: pl.DataFrame | pl.LazyFrame, - index: ColumnNameOrSelector, - column_names: ColumnNameOrSelector, - names_to: list | tuple, - variable_name: str, - value_name: str, - names_pattern: str, - names_transform: pl.Expr, -) -> pl.DataFrame | pl.LazyFrame: - outcome = df.select(pl.all().implode()) - outcome = outcome.melt( - id_vars=index, - value_vars=column_names, - variable_name=variable_name, - value_name=value_name, - ) - alias = outcome.columns - alias = "".join(alias) - alias = f"{alias}_" - outcome = outcome.with_columns( - pl.col(variable_name) - .str.extract_groups(pattern=names_pattern) - .alias(alias) - ) - extract = outcome.select(alias, variable_name) - is_a_lazyframe = isinstance(df, pl.LazyFrame) - if is_a_lazyframe: - extract = extract.collect() - len_fields = len(extract.get_column(alias).struct.fields) - len_names_to = len(names_to) + else: + check("names_pattern", names_pattern, [str]) - if len_names_to != len_fields: - raise ValueError( - f"The length of names_to does not match " - "the number of fields extracted. " - f"The length of names_to is {len_names_to} " - "while the number of fields extracted is " - f"{len_fields}." - ) - expression = pl.exclude(variable_name).is_null().any() - expression = pl.any_horizontal(expression) - null_check = ( - extract.unnest(alias).filter(expression).get_column(variable_name) - ) - if null_check.len(): - column_name = null_check.gather(0).item() - raise ValueError( - f"Column label '{column_name}' " - "could not be matched with any of the groups " - "in the provided regex. Kindly provide a regular expression " - "(with the correct groups) that matches all labels in the columns." 
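A hypothetical illustration (not taken from the test suite) of what the names_pattern guard is protecting against: str.extract_groups yields a struct of nulls for any label the regex cannot match, and those nulls are what the null check surfaces as an error.

    import polars as pl

    frame = pl.DataFrame({"name": ["x_1", "y_2", "oops"]})
    extracted = frame.with_columns(
        pl.col("name").str.extract_groups(r"(.+)_(.+)").alias("extract")
    )
    # 'oops' produces {"1": null, "2": null}; filtering on a null field
    # recovers the offending label for the error message
    bad = extracted.unnest("extract").filter(pl.col("1").is_null())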
- ) + check("values_to", values_to, [str]) - if names_to.count(".value") > 1: - extract = extract.get_column(alias) - _fields = extract.struct.fields - fields = [ - extract.struct.field(label) - for label, name in zip(_fields, names_to) - if name == ".value" - ] - _value = pl.concat_str(fields).alias(".value") - fields = [ - extract.struct.field(label).alias(name) - for label, name in zip(_fields, names_to) - if name != ".value" - ] - fields.append(_value) - extract = pl.struct(fields).alias(alias) - outcome = outcome.with_columns(extract) + if (index is None) and (column_names is None): + column_names = df.columns + index = [] + elif (index is None) and (column_names is not None): + column_names = df.select(column_names).columns + index = df.select(pl.exclude(column_names)).columns + elif (index is not None) and (column_names is None): + index = df.select(index).columns + column_names = df.select(pl.exclude(index)).columns else: - expression = pl.col(alias).struct.rename_fields(names=names_to) - outcome = outcome.with_columns(expression) - - outcome = outcome.select(pl.exclude(variable_name)) - if is_a_lazyframe: - series = outcome.select(alias).collect() - outcome = outcome.with_columns(series) - outcome = outcome.unnest(alias) - if names_transform is not None: - outcome = outcome.with_columns(names_transform) - return outcome + index = df.select(index).columns + column_names = df.select(column_names).columns + + return ( + df, + index, + column_names, + names_to, + values_to, + names_sep, + names_pattern, + ) diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py index 4de18f0d8..d062aac12 100644 --- a/tests/polars/functions/test_pivot_longer_polars.py +++ b/tests/polars/functions/test_pivot_longer_polars.py @@ -266,9 +266,6 @@ def test_names_pattern_dot_value(test_df): names_pattern="(.+)_(.+)", ) .sort(by=["loc", "lat", "long"]) - .with_columns( - pl.col("lat").cast(pl.Float64), pl.col("long").cast(pl.Float64) - ) .select("set", "loc", "lat", "long") ) assert_frame_equal(result, actual) @@ -284,9 +281,6 @@ def test_names_sep_dot_value(test_df): names_sep="_", ) .sort(by=["loc", "lat", "long"]) - .with_columns( - pl.col("lat").cast(pl.Float64), pl.col("long").cast(pl.Float64) - ) .select("set", "loc", "lat", "long") ) assert_frame_equal(result, actual) @@ -408,9 +402,6 @@ def test_multiple_dot_value(): names_pattern=r"(x|y)_([0-9])(_mean|_sd)", names_transform=pl.col("time").cast(pl.Int64), ) - .with_columns( - pl.col("x_mean").cast(pl.Int64), pl.col("y_mean").cast(pl.Int64) - ) .select("unit", "time", "x_mean", "x_sd", "y_mean", "y_sd") .sort(by=pl.all()) ) @@ -448,7 +439,7 @@ def test_multiple_dot_value2(single_val): index="id", names_to=(".value", ".value"), names_pattern="(.)(.)" ) - assert_frame_equal(result, single_val, check_column_order=False) + assert_frame_equal(result, single_val) actual3 = [ @@ -472,7 +463,7 @@ def test_names_pattern_single_column(single_val): "id", names_to=".value", names_pattern="(.)." 
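To make the '.value'-only case concrete (hypothetical data, not the fixture used in these tests): with names_pattern="(.).", only the first character of each label survives as a column name, and no name column is kept.

    import polars as pl

    df = pl.DataFrame({"id": [1, 2], "x1": [3, 4], "x2": [5, 6]})
    # 'x1' and 'x2' both map to '.value' == 'x', so their values are
    # stacked under a single 'x' column:
    #
    #   id  x
    #    1  3
    #    2  4
    #    1  5
    #    2  6

The order in which the stacked values arrive is an implementation detail, which is why the assertions below now sort both frames before comparing.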
) - assert_frame_equal(result, actual3) + assert_frame_equal(result.sort(by=pl.all()), actual3.sort(by=pl.all())) def test_names_pattern_single_column_not_dot_value(single_val): @@ -539,7 +530,6 @@ def test_names_pattern_nulls_in_data(df_null): names_to=[".value", "child"], names_pattern=r"(.+)_(.+)", ) - .with_columns(pl.col("gender").cast(pl.Float64)) .select("family", "child", "dob", "gender") .sort(by=pl.all()) ) From b5a89a911229053ae8b58ee0cbc3c48a7c7fca24 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 18 Jun 2024 13:11:30 +1000 Subject: [PATCH 03/21] fix docs and tests --- janitor/polars/dataframe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/janitor/polars/dataframe.py b/janitor/polars/dataframe.py index e3d437d8b..ca0feb8ca 100644 --- a/janitor/polars/dataframe.py +++ b/janitor/polars/dataframe.py @@ -177,21 +177,21 @@ def pivot_longer( ... index = 'Species', ... names_to = ('part', 'dimension'), ... names_sep = '.', - ... ).select('Species','part','dimension','value') + ... ).select('Species','part','dimension','value').sort(by=pl.all()) shape: (8, 4) ┌───────────┬───────┬───────────┬───────┐ │ Species ┆ part ┆ dimension ┆ value │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 │ ╞═══════════╪═══════╪═══════════╪═══════╡ + │ setosa ┆ Petal ┆ Length ┆ 1.4 │ + │ setosa ┆ Petal ┆ Width ┆ 0.2 │ │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ - │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ - │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ - │ setosa ┆ Petal ┆ Length ┆ 1.4 │ │ virginica ┆ Petal ┆ Length ┆ 5.1 │ - │ setosa ┆ Petal ┆ Width ┆ 0.2 │ │ virginica ┆ Petal ┆ Width ┆ 1.8 │ + │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ + │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ └───────────┴───────┴───────────┴───────┘ Retain parts of the column names as headers: @@ -199,7 +199,7 @@ def pivot_longer( ... index = 'Species', ... names_to = ('part', '.value'), ... names_sep = '.', - ... ).select('Species','part','Length','Width') + ... 
).select('Species','part','Length','Width').sort(by=pl.all()) shape: (4, 4) ┌───────────┬───────┬────────┬───────┐ │ Species ┆ part ┆ Length ┆ Width │ @@ -207,8 +207,8 @@ def pivot_longer( │ str ┆ str ┆ f64 ┆ f64 │ ╞═══════════╪═══════╪════════╪═══════╡ │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ - │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ └───────────┴───────┴────────┴───────┘ From 527893630cfcb61e21a4647bfe3598d2c3f35d4a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 22:49:26 +1000 Subject: [PATCH 04/21] fix doc --- janitor/polars/dataframe.py | 10 +++++----- janitor/polars/lazyframe.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/janitor/polars/dataframe.py b/janitor/polars/dataframe.py index ca0feb8ca..d1423a2b8 100644 --- a/janitor/polars/dataframe.py +++ b/janitor/polars/dataframe.py @@ -155,21 +155,21 @@ def pivot_longer( └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): - >>> df.janitor.pivot_longer(index = 'Species') + >>> df.janitor.pivot_longer(index = 'Species').sort(by=pl.all()) shape: (8, 3) ┌───────────┬──────────────┬───────┐ │ Species ┆ variable ┆ value │ │ --- ┆ --- ┆ --- │ │ str ┆ str ┆ f64 │ ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ │ virginica ┆ Petal.Width ┆ 1.8 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ └───────────┴──────────────┴───────┘ Split the column labels into individual columns: diff --git a/janitor/polars/lazyframe.py b/janitor/polars/lazyframe.py index f059ab1f5..cd20b2f5a 100644 --- a/janitor/polars/lazyframe.py +++ b/janitor/polars/lazyframe.py @@ -154,21 +154,21 @@ def pivot_longer( │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - >>> df.janitor.pivot_longer(index = 'Species').collect() + >>> df.janitor.pivot_longer(index = 'Species').sort(by=pl.all()).collect() shape: (8, 3) ┌───────────┬──────────────┬───────┐ │ Species ┆ variable ┆ value │ │ --- ┆ --- ┆ --- │ │ str ┆ str ┆ f64 │ ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ │ virginica ┆ Petal.Width ┆ 1.8 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ └───────────┴──────────────┴───────┘ !!! 
info "New in version 0.28.0" From e8c3057ac526182eeda0ff4f85d81c42ed3dfebe Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 22:57:11 +1000 Subject: [PATCH 05/21] fix doc pivot_longer_spec --- janitor/polars/pivot_longer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index aaf66ff8f..49da64ea2 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -75,7 +75,7 @@ def pivot_longer_spec( │ Sepal.Width ┆ Width ┆ Sepal │ │ Petal.Width ┆ Width ┆ Petal │ └──────────────┴────────┴───────┘ - >>> df.pipe(pivot_longer_spec,spec=spec) + >>> df.pipe(pivot_longer_spec,spec=spec).sort*by=pl.all()) shape: (4, 4) ┌───────────┬───────┬────────┬───────┐ │ Species ┆ part ┆ Length ┆ Width │ @@ -83,8 +83,8 @@ def pivot_longer_spec( │ str ┆ str ┆ f64 ┆ f64 │ ╞═══════════╪═══════╪════════╪═══════╡ │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ - │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ └───────────┴───────┴────────┴───────┘ From 7c497cd3e97628574ffa4fad60501e9dc46bdf98 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 23:02:10 +1000 Subject: [PATCH 06/21] fix doc pivot_longer_spec --- janitor/polars/pivot_longer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 49da64ea2..224604131 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -75,7 +75,7 @@ def pivot_longer_spec( │ Sepal.Width ┆ Width ┆ Sepal │ │ Petal.Width ┆ Width ┆ Petal │ └──────────────┴────────┴───────┘ - >>> df.pipe(pivot_longer_spec,spec=spec).sort*by=pl.all()) + >>> df.pipe(pivot_longer_spec,spec=spec).sort(by=pl.all()) shape: (4, 4) ┌───────────┬───────┬────────┬───────┐ │ Species ┆ part ┆ Length ┆ Width │ From 513fe73067c5e76b08e28a4d6e6b0679804ed0b6 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:10:29 +1000 Subject: [PATCH 07/21] updates --- janitor/polars/pivot_longer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index c1edf235f..b25974eae 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -217,7 +217,7 @@ def pivot_longer( └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): - >>> df.pivot_longer(index = 'Species') + >>> df.janitor.pivot_longer(index = 'Species').sort(by=pl.all()) shape: (8, 3) ┌───────────┬──────────────┬───────┐ │ Species ┆ variable ┆ value │ @@ -239,7 +239,7 @@ def pivot_longer( ... index = 'Species', ... names_to = ('part', 'dimension'), ... names_sep = '.', - ... ).select('Species','part','dimension','value') + ... ).select('Species','part','dimension','value').sort(by=pl.all()) shape: (8, 4) ┌───────────┬───────┬───────────┬───────┐ │ Species ┆ part ┆ dimension ┆ value │ @@ -261,7 +261,7 @@ def pivot_longer( ... index = 'Species', ... names_to = ('part', '.value'), ... names_sep = '.', - ... ).select('Species','part','Length','Width') + ... 
).select('Species','part','Length','Width').sort(by=pl.all()) shape: (4, 4) ┌───────────┬───────┬────────┬───────┐ │ Species ┆ part ┆ Length ┆ Width │ From 23994846eafcd666f7dc0fd17c2a679a5ad893fc Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:14:49 +1000 Subject: [PATCH 08/21] updates --- janitor/polars/pivot_longer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index b25974eae..c11b71005 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -217,7 +217,7 @@ def pivot_longer( └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): - >>> df.janitor.pivot_longer(index = 'Species').sort(by=pl.all()) + >>> df.pivot_longer(index = 'Species').sort(by=pl.all()) shape: (8, 3) ┌───────────┬──────────────┬───────┐ │ Species ┆ variable ┆ value │ From 49fc6384fa1b2117ddc3baafade7102368d1b169 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:19:01 +1000 Subject: [PATCH 09/21] updates --- janitor/polars/pivot_longer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index c11b71005..b07964bbb 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -224,14 +224,14 @@ def pivot_longer( │ --- ┆ --- ┆ --- │ │ str ┆ str ┆ f64 │ ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ │ virginica ┆ Petal.Width ┆ 1.8 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ └───────────┴──────────────┴───────┘ Split the column labels into individual columns: From 610794816140a0d62073b63f09bd687eeb07bfaf Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:26:12 +1000 Subject: [PATCH 10/21] fix docs --- janitor/polars/pivot_longer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index b07964bbb..e8fd7f572 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -246,14 +246,14 @@ def pivot_longer( │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 │ ╞═══════════╪═══════╪═══════════╪═══════╡ + │ setosa ┆ Petal ┆ Length ┆ 1.4 │ + │ setosa ┆ Petal ┆ Width ┆ 0.2 │ │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ - │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ - │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ - │ setosa ┆ Petal ┆ Length ┆ 1.4 │ │ virginica ┆ Petal ┆ Length ┆ 5.1 │ - │ setosa ┆ Petal ┆ Width ┆ 0.2 │ │ virginica ┆ Petal ┆ Width ┆ 1.8 │ + │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ + │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ └───────────┴───────┴───────────┴───────┘ Retain parts of the column names as headers: @@ -268,10 +268,10 @@ def pivot_longer( │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ f64 ┆ f64 │ ╞═══════════╪═══════╪════════╪═══════╡ - │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ - │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ + │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ + │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ 
└───────────┴───────┴────────┴───────┘ Split the column labels based on regex: From f2b956b61733771e41adecbc040736521dd2cdc4 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:35:15 +1000 Subject: [PATCH 11/21] fix tests --- tests/polars/functions/test_pivot_longer_polars.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py index 76a09efea..d2942c9fc 100644 --- a/tests/polars/functions/test_pivot_longer_polars.py +++ b/tests/polars/functions/test_pivot_longer_polars.py @@ -114,7 +114,7 @@ def test_names_pat_str(df_checks): and .value is present. """ result = ( - df_checks.janitor.pivot_longer( + df_checks.pivot_longer( index=["famid", "birth"], names_to=(".value", "age"), names_pattern="(.+)(.)", @@ -254,7 +254,7 @@ def test_names_pattern_dot_value(test_df): """Test output for names_pattern and .value.""" result = ( - test_df.janitor.pivot_longer( + test_df.pivot_longer( column_names=cs.all(), names_to=["set", ".value"], names_pattern="(.+)_(.+)", @@ -269,7 +269,7 @@ def test_names_sep_dot_value(test_df): """Test output for names_pattern and .value.""" result = ( - test_df.janitor.pivot_longer( + test_df.pivot_longer( column_names=cs.all(), names_to=["set", ".value"], names_sep="_", @@ -480,7 +480,7 @@ def test_names_pattern_single_column_not_dot_value1(single_val): """ result = ( single_val.select("x1") - .janitor.pivot_longer(names_to="yA", names_pattern="(.+)") + .pivot_longer(names_to="yA", names_pattern="(.+)") .select("yA", "value") ) From d849cff9dd74409b3b07218b899e8bd1af75fcff Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 22 Jun 2024 12:45:40 +1000 Subject: [PATCH 12/21] change sort logic for `complete` --- janitor/polars/complete.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index ef098ede7..062fc3afd 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -110,7 +110,7 @@ def complete( >>> with pl.Config(tbl_rows=-1): ... df.complete( ... "group", - ... pl.struct("item_id", "item_name").unique().sort().alias("rar"), + ... pl.struct("item_id", "item_name").unique().alias("rar"), ... sort=True ... ) shape: (8, 5) @@ -133,7 +133,7 @@ def complete( >>> with pl.Config(tbl_rows=-1): ... df.complete( ... "group", - ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), + ... pl.struct("item_id", "item_name").unique().alias('rar'), ... fill_value={"value1": 0, "value2": 99}, ... explicit=True, ... sort=True, @@ -159,7 +159,7 @@ def complete( >>> with pl.Config(tbl_rows=-1): ... df.complete( ... "group", - ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), + ... pl.struct("item_id", "item_name").unique().alias('rar'), ... fill_value={"value1": 0, "value2": 99}, ... explicit=False, ... 
sort=True, @@ -343,13 +343,9 @@ def _complete( for column in columns: if isinstance(column, str): col = pl.col(column).unique() - if sort: - col = col.sort() _columns.append(col) elif cs.is_selector(column): col = column.as_expr().unique() - if sort: - col = col.sort() _columns.append(col) elif isinstance(column, pl.Expr): _columns.append(column) @@ -383,16 +379,35 @@ def _complete( for column in _columns: uniques = uniques.unnest(columns=column) + merge_columns = uniques.columns + if sort: + sort_index = "".join(uniques.columns + df.columns) + sort_index = f"{sort_index}_" + uniques = uniques.with_row_index(name=sort_index) + else: + sort_index = None no_columns_to_fill = set(df.columns) == set(uniques.columns) if fill_value is None or no_columns_to_fill: - return uniques.join(df, on=uniques.columns, how="full", coalesce=True) + if not sort: + return uniques.join( + df, on=merge_columns, how="full", coalesce=True + ) + return ( + uniques.join(df, on=merge_columns, how="full", coalesce=True) + .sort(by=sort_index) + .select(pl.exclude(sort_index)) + ) idx = None columns_to_select = df.columns if not explicit: - idx = "".join(df.columns) + idx = "".join(df.columns + uniques.columns) idx = f"{idx}_" df = df.with_row_index(name=idx) - df = uniques.join(df, on=uniques.columns, how="full", coalesce=True) + else: + idx = None + df = uniques.join(df, on=merge_columns, how="full", coalesce=True) + if sort: + df = df.sort(by=sort_index).select(pl.exclude(sort_index)) # exclude columns that were not used # to generate the combinations exclude_columns = uniques.columns From aee2b09d84a3fc334dc29916261d282576d5239a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 22 Jun 2024 12:57:32 +1000 Subject: [PATCH 13/21] updates to complete --- janitor/polars/complete.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index 062fc3afd..c3cff06e4 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -110,7 +110,7 @@ def complete( >>> with pl.Config(tbl_rows=-1): ... df.complete( ... "group", - ... pl.struct("item_id", "item_name").unique().alias("rar"), + ... pl.struct("item_id", "item_name").unique().sort().alias("rar"), ... sort=True ... ) shape: (8, 5) @@ -133,7 +133,7 @@ def complete( >>> with pl.Config(tbl_rows=-1): ... df.complete( ... "group", - ... pl.struct("item_id", "item_name").unique().alias('rar'), + ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), ... fill_value={"value1": 0, "value2": 99}, ... explicit=True, ... sort=True, @@ -159,7 +159,7 @@ def complete( >>> with pl.Config(tbl_rows=-1): ... df.complete( ... "group", - ... pl.struct("item_id", "item_name").unique().alias('rar'), + ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), ... fill_value={"value1": 0, "value2": 99}, ... explicit=False, ... sort=True, @@ -343,9 +343,13 @@ def _complete( for column in columns: if isinstance(column, str): col = pl.col(column).unique() + if sort: + col = col.sort() _columns.append(col) elif cs.is_selector(column): col = column.as_expr().unique() + if sort: + col = col.sort() _columns.append(col) elif isinstance(column, pl.Expr): _columns.append(column) @@ -354,7 +358,7 @@ def _complete( f"The argument passed to the columns parameter " "should either be a string, a column selector, " "or a polars expression, instead got - " - f"{type(column)}." + f"{type(column).__name__}." 
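A hedged sketch of the approach complete() itself relies on (hand-rolled, with hypothetical column names): build the unique combinations once, then join the original frame back so missing combinations surface as nulls.

    import polars as pl

    df = pl.DataFrame({"group": [1, 2], "item": ["a", "b"], "value": [10, 20]})

    # unique values per column, imploded to a single row, then exploded
    # pairwise into the full cartesian product
    uniques = (
        df.select(
            pl.col("group").unique().sort().implode(),
            pl.col("item").unique().sort().implode(),
        )
        .explode("group")
        .explode("item")
    )
    # (1, "b") and (2, "a") are absent from df, so their 'value' is null
    out = uniques.join(df, on=["group", "item"], how="full", coalesce=True)

fill_value then decides what those nulls become, and explicit controls whether pre-existing nulls are overwritten as well.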
) by_does_not_exist = by is None if by_does_not_exist: From 6a5f66e220a7a12bbe94fdea0d54930fc296d79a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 22 Jun 2024 15:35:42 +1000 Subject: [PATCH 14/21] restore inital setup for complete --- janitor/polars/complete.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index c3cff06e4..ef098ede7 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -358,7 +358,7 @@ def _complete( f"The argument passed to the columns parameter " "should either be a string, a column selector, " "or a polars expression, instead got - " - f"{type(column).__name__}." + f"{type(column)}." ) by_does_not_exist = by is None if by_does_not_exist: @@ -383,35 +383,16 @@ def _complete( for column in _columns: uniques = uniques.unnest(columns=column) - merge_columns = uniques.columns - if sort: - sort_index = "".join(uniques.columns + df.columns) - sort_index = f"{sort_index}_" - uniques = uniques.with_row_index(name=sort_index) - else: - sort_index = None no_columns_to_fill = set(df.columns) == set(uniques.columns) if fill_value is None or no_columns_to_fill: - if not sort: - return uniques.join( - df, on=merge_columns, how="full", coalesce=True - ) - return ( - uniques.join(df, on=merge_columns, how="full", coalesce=True) - .sort(by=sort_index) - .select(pl.exclude(sort_index)) - ) + return uniques.join(df, on=uniques.columns, how="full", coalesce=True) idx = None columns_to_select = df.columns if not explicit: - idx = "".join(df.columns + uniques.columns) + idx = "".join(df.columns) idx = f"{idx}_" df = df.with_row_index(name=idx) - else: - idx = None - df = uniques.join(df, on=merge_columns, how="full", coalesce=True) - if sort: - df = df.sort(by=sort_index).select(pl.exclude(sort_index)) + df = uniques.join(df, on=uniques.columns, how="full", coalesce=True) # exclude columns that were not used # to generate the combinations exclude_columns = uniques.columns From 8ea3f5622c58cf5d936a4709457675da7235a7b6 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 22 Jun 2024 20:15:11 +1000 Subject: [PATCH 15/21] remove dead code --- janitor/polars/pivot_longer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index e8fd7f572..37f937627 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -538,8 +538,6 @@ def _pivot_longer_create_spec( spec = spec.with_columns( pl.col(variable_name).struct.rename_fields(names=names_to) ) - if ".value" not in names_to: - return spec.get_column(variable_name) not_dot_value = [name for name in names_to if name != ".value"] spec = spec.unnest(variable_name) if not_dot_value: From cf350a38a7cf62a387a462fd2effc49e47100101 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 24 Jun 2024 07:13:01 +1000 Subject: [PATCH 16/21] use left join --- janitor/polars/complete.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index ef098ede7..546f903bc 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -385,14 +385,14 @@ def _complete( no_columns_to_fill = set(df.columns) == set(uniques.columns) if fill_value is None or no_columns_to_fill: - return uniques.join(df, on=uniques.columns, how="full", coalesce=True) + return uniques.join(df, on=uniques.columns, how="left", coalesce=True) idx = None columns_to_select = 
df.columns if not explicit: idx = "".join(df.columns) idx = f"{idx}_" df = df.with_row_index(name=idx) - df = uniques.join(df, on=uniques.columns, how="full", coalesce=True) + df = uniques.join(df, on=uniques.columns, how="left", coalesce=True) # exclude columns that were not used # to generate the combinations exclude_columns = uniques.columns From 8fe093cb83901f25bb7b46e1246b9a55c61feba7 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Wed, 26 Jun 2024 22:37:19 +1000 Subject: [PATCH 17/21] update docs for pivot_longer --- janitor/polars/pivot_longer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 37f937627..9dea2581f 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -33,7 +33,7 @@ def pivot_longer_spec( becomes variables. It can come in handy for situations where - `janitor.polars.pivot_longer` + [`pivot_longer`][janitor.polars.pivot_longer.pivot_longer] seems inadequate for the transformation. !!! info "New in version 0.28.0" @@ -187,8 +187,11 @@ def pivot_longer( All measured variables are *unpivoted* (and typically duplicated) along the row axis. + If `names_pattern`, use a valid regular expression pattern containing at least + one capture group, compatible with the [regex crate](https://docs.rs/regex/latest/regex/). + For more granular control on the unpivoting, have a look at - `pivot_longer_spec`. + [`pivot_longer_spec`][janitor.polars.pivot_longer.pivot_longer_spec]. `pivot_longer` can also be applied to a LazyFrame. From 8dd1d82abefe500dfc7339e485586f957f1a0529 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 27 Jun 2024 11:38:27 +1000 Subject: [PATCH 18/21] WIP - expand --- janitor/polars/__init__.py | 2 + janitor/polars/expand.py | 215 +++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 janitor/polars/expand.py diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 1485ad3f2..5ee31ef28 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -1,5 +1,6 @@ from .clean_names import clean_names, make_clean_names from .complete import complete +from .expand import expand from .pivot_longer import pivot_longer, pivot_longer_spec from .row_to_names import row_to_names @@ -10,4 +11,5 @@ "make_clean_names", "row_to_names", "complete", + "expand", ] diff --git a/janitor/polars/expand.py b/janitor/polars/expand.py new file mode 100644 index 000000000..10a0c8a49 --- /dev/null +++ b/janitor/polars/expand.py @@ -0,0 +1,215 @@ +"""expland implementation for polars.""" + +from __future__ import annotations + +from janitor.utils import check, import_message + +from .polars_flavor import register_dataframe_method, register_lazyframe_method + +try: + import polars as pl + import polars.selectors as cs + from polars.type_aliases import ColumnNameOrSelector +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +@register_lazyframe_method +@register_dataframe_method +def expand( + df: pl.DataFrame | pl.LazyFrame, + *columns: tuple[ColumnNameOrSelector], + by: ColumnNameOrSelector = None, + sort: bool = False, +) -> pl.DataFrame | pl.LazyFrame: + """ + Creates a DataFrame from a cartesian combination of all inputs. + + Inspiration is from tidyr's expand() function. + + If `by` is present, the DataFrame is *expanded* per group. + + `expand` can also be applied to a LazyFrame. + + !!! 
info "New in version 0.28.0" + + Examples: + >>> import pandas as pd + >>> import janitor + >>> data = [{'type': 'apple', 'year': 2010, 'size': 'XS'}, + ... {'type': 'orange', 'year': 2010, 'size': 'S'}, + ... {'type': 'apple', 'year': 2012, 'size': 'M'}, + ... {'type': 'orange', 'year': 2010, 'size': 'S'}, + ... {'type': 'orange', 'year': 2011, 'size': 'S'}, + ... {'type': 'orange', 'year': 2012, 'size': 'M'}] + >>> df = pd.DataFrame(data) + >>> df + type year size + 0 apple 2010 XS + 1 orange 2010 S + 2 apple 2012 M + 3 orange 2010 S + 4 orange 2011 S + 5 orange 2012 M + + Get unique observations: + >>> df.expand('type') + type + 0 apple + 1 orange + >>> df.expand('size') + size + 0 XS + 1 S + 2 M + >>> df.expand('type', 'size') + type size + 0 apple XS + 1 apple S + 2 apple M + 3 orange XS + 4 orange S + 5 orange M + >>> df.expand('type','size','year') + type size year + 0 apple XS 2010 + 1 apple XS 2012 + 2 apple XS 2011 + 3 apple S 2010 + 4 apple S 2012 + 5 apple S 2011 + 6 apple M 2010 + 7 apple M 2012 + 8 apple M 2011 + 9 orange XS 2010 + 10 orange XS 2012 + 11 orange XS 2011 + 12 orange S 2010 + 13 orange S 2012 + 14 orange S 2011 + 15 orange M 2010 + 16 orange M 2012 + 17 orange M 2011 + + Get observations that only occur in the data: + >>> df.expand(['type','size']) + type size + 0 apple XS + 1 orange S + 2 apple M + 3 orange M + >>> df.expand(['type','size','year']) + type size year + 0 apple XS 2010 + 1 orange S 2010 + 2 apple M 2012 + 3 orange S 2011 + 4 orange M 2012 + + Expand the DataFrame to include new observations: + >>> df.expand('type','size',{'new_year':range(2010,2014)}) + type size new_year + 0 apple XS 2010 + 1 apple XS 2011 + 2 apple XS 2012 + 3 apple XS 2013 + 4 apple S 2010 + 5 apple S 2011 + 6 apple S 2012 + 7 apple S 2013 + 8 apple M 2010 + 9 apple M 2011 + 10 apple M 2012 + 11 apple M 2013 + 12 orange XS 2010 + 13 orange XS 2011 + 14 orange XS 2012 + 15 orange XS 2013 + 16 orange S 2010 + 17 orange S 2011 + 18 orange S 2012 + 19 orange S 2013 + 20 orange M 2010 + 21 orange M 2011 + 22 orange M 2012 + 23 orange M 2013 + + Filter for missing observations: + >>> combo = df.expand('type','size','year') + >>> anti_join = df.merge(combo, how='right', indicator=True) + >>> anti_join.query("_merge=='right_only").drop(columns="_merge") + type year size + 1 apple 2012 XS + 2 apple 2011 XS + 3 apple 2010 S + 4 apple 2012 S + 5 apple 2011 S + 6 apple 2010 M + 8 apple 2011 M + 9 orange 2010 XS + 10 orange 2012 XS + 11 orange 2011 XS + 14 orange 2012 S + 16 orange 2010 M + 18 orange 2011 M + + Expand within each group, using `by`: + >>> df.expand('year','size',by='type') + year size + type + apple 2010 XS + apple 2010 M + apple 2012 XS + apple 2012 M + orange 2010 S + orange 2010 M + orange 2011 S + orange 2011 M + orange 2012 S + orange 2012 M + + Args: + df: A pandas DataFrame/LazyFrame. + columns: Specification of columns to expand. + by: If present, the DataFrame is expanded per group. + + Returns: + A polars DataFrame/LazyFrame. 
+ """ + if not columns: + return df + check("sort", sort, [bool]) + _columns = [] + for column in columns: + if isinstance(column, str): + col = pl.col(column) + if sort: + col = col.sort() + _columns.append(col.implode()) + elif cs.is_selector(column): + col = column.as_expr() + if sort: + col = col.sort() + _columns.append(col.implode()) + elif isinstance(column, (pl.Expr, pl.Series)): + _columns.append(column) + else: + raise TypeError( + f"The argument passed to the columns parameter " + "should either be a string, a column selector, " + "or a polars expression, instead got - " + f"{type(column)}." + ) + by_does_not_exist = by is None + if by_does_not_exist: + df = df.select(_columns) + else: + df = df.group_by(by, maintain_order=sort).agg(_columns) + for column in df.columns: + df = df.explode(column) + return df From 83296d1f4cf90c7295c3d434331c42dcfc8d9586 Mon Sep 17 00:00:00 2001 From: Samuel Oranyeli Date: Thu, 27 Jun 2024 12:40:22 +1000 Subject: [PATCH 19/21] Delete janitor/polars/expand.py --- janitor/polars/expand.py | 215 --------------------------------------- 1 file changed, 215 deletions(-) delete mode 100644 janitor/polars/expand.py diff --git a/janitor/polars/expand.py b/janitor/polars/expand.py deleted file mode 100644 index 10a0c8a49..000000000 --- a/janitor/polars/expand.py +++ /dev/null @@ -1,215 +0,0 @@ -"""expland implementation for polars.""" - -from __future__ import annotations - -from janitor.utils import check, import_message - -from .polars_flavor import register_dataframe_method, register_lazyframe_method - -try: - import polars as pl - import polars.selectors as cs - from polars.type_aliases import ColumnNameOrSelector -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -@register_lazyframe_method -@register_dataframe_method -def expand( - df: pl.DataFrame | pl.LazyFrame, - *columns: tuple[ColumnNameOrSelector], - by: ColumnNameOrSelector = None, - sort: bool = False, -) -> pl.DataFrame | pl.LazyFrame: - """ - Creates a DataFrame from a cartesian combination of all inputs. - - Inspiration is from tidyr's expand() function. - - If `by` is present, the DataFrame is *expanded* per group. - - `expand` can also be applied to a LazyFrame. - - !!! info "New in version 0.28.0" - - Examples: - >>> import pandas as pd - >>> import janitor - >>> data = [{'type': 'apple', 'year': 2010, 'size': 'XS'}, - ... {'type': 'orange', 'year': 2010, 'size': 'S'}, - ... {'type': 'apple', 'year': 2012, 'size': 'M'}, - ... {'type': 'orange', 'year': 2010, 'size': 'S'}, - ... {'type': 'orange', 'year': 2011, 'size': 'S'}, - ... 
{'type': 'orange', 'year': 2012, 'size': 'M'}] - >>> df = pd.DataFrame(data) - >>> df - type year size - 0 apple 2010 XS - 1 orange 2010 S - 2 apple 2012 M - 3 orange 2010 S - 4 orange 2011 S - 5 orange 2012 M - - Get unique observations: - >>> df.expand('type') - type - 0 apple - 1 orange - >>> df.expand('size') - size - 0 XS - 1 S - 2 M - >>> df.expand('type', 'size') - type size - 0 apple XS - 1 apple S - 2 apple M - 3 orange XS - 4 orange S - 5 orange M - >>> df.expand('type','size','year') - type size year - 0 apple XS 2010 - 1 apple XS 2012 - 2 apple XS 2011 - 3 apple S 2010 - 4 apple S 2012 - 5 apple S 2011 - 6 apple M 2010 - 7 apple M 2012 - 8 apple M 2011 - 9 orange XS 2010 - 10 orange XS 2012 - 11 orange XS 2011 - 12 orange S 2010 - 13 orange S 2012 - 14 orange S 2011 - 15 orange M 2010 - 16 orange M 2012 - 17 orange M 2011 - - Get observations that only occur in the data: - >>> df.expand(['type','size']) - type size - 0 apple XS - 1 orange S - 2 apple M - 3 orange M - >>> df.expand(['type','size','year']) - type size year - 0 apple XS 2010 - 1 orange S 2010 - 2 apple M 2012 - 3 orange S 2011 - 4 orange M 2012 - - Expand the DataFrame to include new observations: - >>> df.expand('type','size',{'new_year':range(2010,2014)}) - type size new_year - 0 apple XS 2010 - 1 apple XS 2011 - 2 apple XS 2012 - 3 apple XS 2013 - 4 apple S 2010 - 5 apple S 2011 - 6 apple S 2012 - 7 apple S 2013 - 8 apple M 2010 - 9 apple M 2011 - 10 apple M 2012 - 11 apple M 2013 - 12 orange XS 2010 - 13 orange XS 2011 - 14 orange XS 2012 - 15 orange XS 2013 - 16 orange S 2010 - 17 orange S 2011 - 18 orange S 2012 - 19 orange S 2013 - 20 orange M 2010 - 21 orange M 2011 - 22 orange M 2012 - 23 orange M 2013 - - Filter for missing observations: - >>> combo = df.expand('type','size','year') - >>> anti_join = df.merge(combo, how='right', indicator=True) - >>> anti_join.query("_merge=='right_only").drop(columns="_merge") - type year size - 1 apple 2012 XS - 2 apple 2011 XS - 3 apple 2010 S - 4 apple 2012 S - 5 apple 2011 S - 6 apple 2010 M - 8 apple 2011 M - 9 orange 2010 XS - 10 orange 2012 XS - 11 orange 2011 XS - 14 orange 2012 S - 16 orange 2010 M - 18 orange 2011 M - - Expand within each group, using `by`: - >>> df.expand('year','size',by='type') - year size - type - apple 2010 XS - apple 2010 M - apple 2012 XS - apple 2012 M - orange 2010 S - orange 2010 M - orange 2011 S - orange 2011 M - orange 2012 S - orange 2012 M - - Args: - df: A pandas DataFrame/LazyFrame. - columns: Specification of columns to expand. - by: If present, the DataFrame is expanded per group. - - Returns: - A polars DataFrame/LazyFrame. - """ - if not columns: - return df - check("sort", sort, [bool]) - _columns = [] - for column in columns: - if isinstance(column, str): - col = pl.col(column) - if sort: - col = col.sort() - _columns.append(col.implode()) - elif cs.is_selector(column): - col = column.as_expr() - if sort: - col = col.sort() - _columns.append(col.implode()) - elif isinstance(column, (pl.Expr, pl.Series)): - _columns.append(column) - else: - raise TypeError( - f"The argument passed to the columns parameter " - "should either be a string, a column selector, " - "or a polars expression, instead got - " - f"{type(column)}." 
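For the record, a sketch of the per-group behaviour the deleted code aimed at (hand-rolled, hypothetical data): group on `by`, aggregate every other column to its unique values, then explode the lists to get a cartesian product within each group.

    import polars as pl

    df = pl.DataFrame(
        {
            "type": ["apple", "apple", "orange"],
            "year": [2010, 2012, 2010],
            "size": ["XS", "M", "S"],
        }
    )
    out = (
        df.group_by("type", maintain_order=True)
        .agg(pl.col("year").unique().sort(), pl.col("size").unique().sort())
        .explode("year")
        .explode("size")
    )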
- ) - by_does_not_exist = by is None - if by_does_not_exist: - df = df.select(_columns) - else: - df = df.group_by(by, maintain_order=sort).agg(_columns) - for column in df.columns: - df = df.explode(column) - return df From f1fab2eafebfa572c3d2f77509e743ef7af31a42 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 27 Jun 2024 12:48:26 +1000 Subject: [PATCH 20/21] remove expand --- janitor/polars/__init__.py | 2 -- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 5ee31ef28..1485ad3f2 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -1,6 +1,5 @@ from .clean_names import clean_names, make_clean_names from .complete import complete -from .expand import expand from .pivot_longer import pivot_longer, pivot_longer_spec from .row_to_names import row_to_names @@ -11,5 +10,4 @@ "make_clean_names", "row_to_names", "complete", - "expand", ] diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..d80aaa4fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 0 ignore-init-method = true ignore-init-module = true ignore-module = false From 2b986145fc6a8e17258ba0ed509a92647f1cb7d6 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 27 Jun 2024 12:48:32 +1000 Subject: [PATCH 21/21] remove expand --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d80aaa4fa..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 0 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false
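As an end-to-end check of the behaviour this series settles on (a sketch, assuming janitor.polars registers pivot_longer on polars DataFrames as the docstrings above indicate):

    import polars as pl
    import janitor.polars  # noqa: F401 -- assumed to register the methods

    df = pl.DataFrame(
        {
            "Species": ["setosa", "virginica"],
            "Sepal.Length": [5.1, 5.9],
            "Sepal.Width": [3.5, 3.0],
        }
    )
    out = (
        df.pivot_longer(
            index="Species",
            names_to=("part", ".value"),
            names_sep=".",
        )
        # row order is not guaranteed, hence the sort, mirroring the doctests
        .sort(by=pl.all())
    )
    # columns: Species, part, Length, Width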