diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml deleted file mode 100644 index 4fd374ca4..000000000 --- a/.github/workflows/codecov.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Run test suite on latest dev commits - -on: - push: - branches: - - dev - -jobs: - run-tests: - runs-on: ubuntu-latest - name: Run test suite on latest dev commits - - # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell - defaults: - run: - shell: bash -l {0} - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - # See: https://github.com/marketplace/actions/setup-miniconda - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniforge-variant: Mambaforge - channels: conda-forge - activate-environment: pyjanitor-dev - environment-file: environment-dev.yml - use-mamba: true - - - name: Run unit tests - run: | - conda activate pyjanitor-dev - python -m pip install -e . - pytest - - # https://github.com/codecov/codecov-action - - name: Upload code coverage - uses: codecov/codecov-action@v2 - with: - # fail_ci_if_error: true # optional (default = false) - verbose: true # optional (default = false) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 09460fc65..00c1d334f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,6 +1,12 @@ name: tests -on: [pull_request] +on: + push: + branches: + - dev + pull_request: + branches: + - dev concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/CHANGELOG.md b/CHANGELOG.md index c0eb52bc0..e7c579944 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,21 +6,20 @@ - [DOC] Updated developer guide docs. - [ENH] Allow column selection/renaming within conditional_join. Issue #1102. Also allow first or last match. Issue #1020 @samukweku. - [ENH] New decorator `deprecated_kwargs` for breaking API. #1103 @Zeroto521 -- [ENH] Extend select_columns to support non-string columns. Also allow selection on MultiIndex columns via level parameter. Issue #1105 @samukweku +- [ENH] Extend select_columns to support non-string columns. Issue #1105 @samukweku - [ENH] Performance improvement for groupby_topk. Issue #1093 @samukweku - [ENH] `min_max_scale` drop `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521 - [ENH] Add `jointly` option for `min_max_scale` support to transform each column values or entire values. Default transform each column, similar behavior to `sklearn.preprocessing.MinMaxScaler`. (Issue #1067, PR #1112, PR #1123) @Zeroto521 - [INF] Require pyspark minimal version is v3.2.0 to cut duplicates codes. Issue #1110 @Zeroto521 -- [ENH] Added support for extension arrays in `expand_grid`. Issue #1121 @samukweku +- [ENH] Add support for extension arrays in `expand_grid`. Issue #1121 @samukweku - [ENH] Add `names_expand` and `index_expand` parameters to `pivot_wider` for exposing missing categoricals. Issue #1108 @samukweku -- [ENH] Add fix for slicing error when selecting columns in `pivot_wider`. Issue #1134 @samukweku +- [ENH] Add fix for slicing error when selecting columns in `pivot_wider`. Issue #1134 @samukweku - [ENH] `dropna` parameter added to `pivot_longer`. Issue #1132 @samukweku - [INF] Update `mkdocstrings` version and to fit its new coming features. PR #1138 @Zeroto521 - [BUG] Force `math.softmax` returning `Series`. PR #1139 @Zeroto521 - [INF] Set independent environment for building documentation. PR #1141 @Zeroto521 - [DOC] Add local documentation preview via github action artifact. PR #1149 @Zeroto521 - [ENH] Enable `encode_categorical` handle 2 (or more ) dimensions array. PR #1153 @Zeroto521 -- [ENH] Faster computation for a single non-equi join, with a numba engine. Issue #1102 @samukweku - [TST] Fix testcases failing on Window. Issue #1160 @Zeroto521, and @samukweku - [INF] Cancel old workflow runs via Github Action `concurrency`. PR #1161 @Zeroto521 - [ENH] Faster computation for non-equi join, with a numba engine. Speed improvement for left/right joins when `sort_by_appearance` is False. Issue #1102 @samukweku @@ -29,8 +28,12 @@ - [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku - [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku - [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521 +- [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku - [TST] Compat with macos and window, to fix `FailedHealthCheck` Issue #1181 @Zeroto521 - [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521 +- [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521 +- [TST] Fix failure for test/timeseries/test_fill_missing_timestamp. Issue #1184 @samukweku +- [BUG] Import `DataDescription` to fix: `AttributeError: 'DataFrame' object has no attribute 'data_description'`. PR #1191 @Zeroto521 - [INF] Set a series of complete testing envs. Issue #1127 @Zeroto521 ## [v0.23.1] - 2022-05-03 diff --git a/examples/notebooks/select_columns.ipynb b/examples/notebooks/select_columns.ipynb index 8022e6c54..88a9e1a37 100644 --- a/examples/notebooks/select_columns.ipynb +++ b/examples/notebooks/select_columns.ipynb @@ -433,7 +433,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.10" + "version": "3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) \n[GCC 10.3.0]" }, "orig_nbformat": 4 }, diff --git a/janitor/accessors/__init__.py b/janitor/accessors/__init__.py index 6f9e707c3..038bcd9bc 100644 --- a/janitor/accessors/__init__.py +++ b/janitor/accessors/__init__.py @@ -1,17 +1,3 @@ -"""Miscellaneous mathematical operators. +"""Miscellaneous mathematical operators.""" -Lazy loading used here to speed up imports. -""" - -import warnings -from typing import Tuple - - -import lazy_loader as lazy - -scipy_special = lazy.load("scipy.special") -ss = lazy.load("scipy.stats") -pf = lazy.load("pandas_flavor") -pd = lazy.load("pandas") -np = lazy.load("numpy") -pdtypes = lazy.load("pandas.api.types") +from janitor.accessors.data_description import DataDescription # noqa: F401 diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index 9f2d58531..860a38e23 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -64,7 +64,7 @@ from .reorder_columns import reorder_columns from .round_to_fraction import round_to_fraction from .row_to_names import row_to_names -from .select_columns import select_columns +from .select import select_columns, select_rows from .shuffle import shuffle from .sort_column_value_order import sort_column_value_order from .sort_naturally import sort_naturally diff --git a/janitor/functions/coalesce.py b/janitor/functions/coalesce.py index d0ac070cf..506b69ddb 100644 --- a/janitor/functions/coalesce.py +++ b/janitor/functions/coalesce.py @@ -4,7 +4,7 @@ import pandas_flavor as pf from janitor.utils import check, deprecated_alias -from janitor.functions.utils import _select_column_names +from janitor.functions.utils import _select_index @pf.register_dataframe_method @@ -95,7 +95,8 @@ def coalesce( "The number of columns to coalesce should be a minimum of 2." ) - column_names = _select_column_names([*column_names], df) + indices = _select_index([*column_names], df, axis="columns") + column_names = df.columns[indices] if target_column_name: check("target_column_name", target_column_name, [str]) @@ -106,7 +107,7 @@ def coalesce( if target_column_name is None: target_column_name = column_names[0] - outcome = df.filter(column_names).bfill(axis="columns").iloc[:, 0] + outcome = df.loc(axis=1)[column_names].bfill(axis="columns").iloc[:, 0] if outcome.hasnans and (default_value is not None): outcome = outcome.fillna(default_value) diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py index 2f8438166..d3da6f8e0 100644 --- a/janitor/functions/conditional_join.py +++ b/janitor/functions/conditional_join.py @@ -47,7 +47,7 @@ def conditional_join( especially if the intervals do not overlap. Column selection in `df_columns` and `right_columns` is possible using the - [`select_columns`][janitor.functions.select_columns.select_columns] syntax. + [`select_columns`][janitor.functions.select.select_columns] syntax. For strictly non-equi joins, involving either `>`, `<`, `>=`, `<=` operators, @@ -143,7 +143,7 @@ def conditional_join( :param keep: Choose whether to return the first match, last match or all matches. Default is `all`. :param use_numba: Use numba, if installed, to accelerate the computation. - Default is `False`. + Applicable only to strictly non-equi joins. Default is `False`. :returns: A pandas DataFrame of the two merged Pandas objects. """ @@ -1214,10 +1214,11 @@ def _cond_join_select_columns(columns: Any, df: pd.DataFrame): Returns a Pandas DataFrame. """ - df = df.select_columns(columns) - if isinstance(columns, dict): + df = df.select_columns([*columns]) df.columns = [columns.get(name, name) for name in df] + else: + df = df.select_columns(columns) return df diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index c0eaa92fa..9761cd721 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.concat import concat_compat from janitor.functions.utils import ( - _select_column_names, + _select_index, _computations_expand_grid, ) from janitor.utils import check @@ -52,7 +52,7 @@ def pivot_longer( row axis. Column selection in `index` and `column_names` is possible using the - [`select_columns`][janitor.functions.select_columns.select_columns] syntax. + [`select_columns`][janitor.functions.select.select_columns] syntax. Example: @@ -382,17 +382,35 @@ def _data_checks_pivot_longer( "when the columns are a MultiIndex." ) + is_multi_index = isinstance(df.columns, pd.MultiIndex) + indices = None if column_names is not None: - if is_list_like(column_names): - column_names = list(column_names) - column_names = _select_column_names(column_names, df) - column_names = list(column_names) + if is_multi_index: + column_names = _check_tuples_multiindex( + df.columns, column_names, "column_names" + ) + else: + if is_list_like(column_names): + column_names = list(column_names) + indices = _select_index(column_names, df, axis="columns") + column_names = df.columns[indices] + if not is_list_like(column_names): + column_names = [column_names] + else: + column_names = list(column_names) if index is not None: - if is_list_like(index): - index = list(index) - index = _select_column_names(index, df) - index = list(index) + if is_multi_index: + index = _check_tuples_multiindex(df.columns, index, "index") + else: + if is_list_like(index): + index = list(index) + indices = _select_index(index, df, axis="columns") + index = df.columns[indices] + if not is_list_like(index): + index = [index] + else: + index = list(index) if index is None: if column_names is None: @@ -1181,7 +1199,7 @@ def pivot_wider( Column selection in `index`, `names_from` and `values_from` is possible using the - [`select_columns`][janitor.functions.select_columns.select_columns] syntax. + [`select_columns`][janitor.functions.select.select_columns] syntax. A ValueError is raised if the combination of the `index` and `names_from` is not unique. @@ -1455,27 +1473,69 @@ def _data_checks_pivot_wider( checking happens. """ + is_multi_index = isinstance(df.columns, pd.MultiIndex) + indices = None if index is not None: - if is_list_like(index): - index = list(index) - index = _select_column_names(index, df) - index = list(index) + if is_multi_index: + if not isinstance(index, list): + raise TypeError( + "For a MultiIndex column, pass a list of tuples " + "to the index argument." + ) + index = _check_tuples_multiindex(df.columns, index, "index") + else: + if is_list_like(index): + index = list(index) + indices = _select_index(index, df, axis="columns") + index = df.columns[indices] + if not is_list_like(index): + index = [index] + else: + index = list(index) if names_from is None: raise ValueError( "pivot_wider() is missing 1 required argument: 'names_from'" ) - if is_list_like(names_from): - names_from = list(names_from) - names_from = _select_column_names(names_from, df) - names_from = list(names_from) + if is_multi_index: + if not isinstance(names_from, list): + raise TypeError( + "For a MultiIndex column, pass a list of tuples " + "to the names_from argument." + ) + names_from = _check_tuples_multiindex( + df.columns, names_from, "names_from" + ) + else: + if is_list_like(names_from): + names_from = list(names_from) + indices = _select_index(names_from, df, axis="columns") + names_from = df.columns[indices] + if not is_list_like(names_from): + names_from = [names_from] + else: + names_from = list(names_from) if values_from is not None: - if is_list_like(values_from): - values_from = list(values_from) - out = _select_column_names(values_from, df) - out = list(out) + if is_multi_index: + if not isinstance(values_from, list): + raise TypeError( + "For a MultiIndex column, pass a list of tuples " + "to the values_from argument." + ) + out = _check_tuples_multiindex( + df.columns, values_from, "values_from" + ) + else: + if is_list_like(values_from): + values_from = list(values_from) + indices = _select_index(values_from, df, axis="columns") + out = df.columns[indices] + if not is_list_like(out): + out = [out] + else: + out = list(out) # hack to align with pd.pivot if values_from == out[0]: values_from = out[0] @@ -1550,3 +1610,27 @@ def _expand(indexer, retain_categories): ordered=indexer.ordered, ) return indexer + + +def _check_tuples_multiindex(indexer, args, param): + """ + Check entries for tuples, + if indexer is a MultiIndex. + + Returns a list of tuples. + """ + all_tuples = (isinstance(arg, tuple) for arg in args) + if not all(all_tuples): + raise TypeError( + f"{param} must be a list of tuples " + "when the columns are a MultiIndex." + ) + + not_found = set(args).difference(indexer) + if any(not_found): + raise KeyError( + f"Tuples {*not_found,} in the {param} " + "argument do not exist in the dataframe's columns." + ) + + return args diff --git a/janitor/functions/select.py b/janitor/functions/select.py new file mode 100644 index 000000000..4c1480934 --- /dev/null +++ b/janitor/functions/select.py @@ -0,0 +1,117 @@ +import pandas_flavor as pf +import pandas as pd +from janitor.utils import deprecated_alias +from janitor.functions.utils import _select + + +@pf.register_dataframe_method +@deprecated_alias(search_cols="search_column_names") +def select_columns( + df: pd.DataFrame, + *args, + invert: bool = False, +) -> pd.DataFrame: + """ + Method-chainable selection of columns. + + It accepts a string, shell-like glob strings `(*string*)`, + regex, slice, array-like object, or a list of the previous options. + + Selection on a MultiIndex on a level, or multiple levels, + is possible with a dictionary. + + This method does not mutate the original DataFrame. + + Optional ability to invert selection of columns available as well. + + !!! Note + The preferred option when selecting columns or rows in a Pandas DataFrame + is with `.loc` or `.iloc` methods, as they are generally performant. + `select_columns` is primarily for convenience. + + Example: + + >>> import pandas as pd + >>> import janitor + >>> df = pd.DataFrame({"col1": [1, 2], "foo": [3, 4], "col2": [5, 6]}) + >>> df + col1 foo col2 + 0 1 3 5 + 1 2 4 6 + >>> df.select_columns("col*") + col1 col2 + 0 1 5 + 1 2 6 + + :param df: A pandas DataFrame. + :param args: Valid inputs include: an exact column name to look for, + a shell-style glob string (e.g. `*_thing_*`), + a regular expression, + a callable, + or variable arguments of all the aforementioned. + A sequence of booleans is also acceptable. + A dictionary can be used for selection on a MultiIndex on different levels. + :param invert: Whether or not to invert the selection. + This will result in the selection of the complement of the columns + provided. + :returns: A pandas DataFrame with the specified columns selected. + """ # noqa: E501 + + return _select(df, args, invert, axis="columns") + + +@pf.register_dataframe_method +def select_rows( + df: pd.DataFrame, + *args, + invert: bool = False, +) -> pd.DataFrame: + """ + Method-chainable selection of rows. + + It accepts a string, shell-like glob strings `(*string*)`, + regex, slice, array-like object, or a list of the previous options. + + Selection on a MultiIndex on a level, or multiple levels, + is possible with a dictionary. + + This method does not mutate the original DataFrame. + + Optional ability to invert selection of rows available as well. + + !!! Note + The preferred option when selecting columns or rows in a Pandas DataFrame + is with `.loc` or `.iloc` methods, as they are generally performant. + `select_rows` is primarily for convenience. + + Example: + + >>> import pandas as pd + >>> import janitor + >>> df = {"col1": [1, 2], "foo": [3, 4], "col2": [5, 6]} + >>> df = pd.DataFrame.from_dict(df, orient='index') + >>> df + 0 1 + col1 1 2 + foo 3 4 + col2 5 6 + >>> df.select_rows("col*") + 0 1 + col1 1 2 + col2 5 6 + + :param df: A pandas DataFrame. + :param args: Valid inputs include: an exact index name to look for, + a shell-style glob string (e.g. `*_thing_*`), + a regular expression, + a callable, + or variable arguments of all the aforementioned. + A sequence of booleans is also acceptable. + A dictionary can be used for selection on a MultiIndex on different levels. + :param invert: Whether or not to invert the selection. + This will result in the selection of the complement of the rows + provided. + :returns: A pandas DataFrame with the specified rows selected. + """ # noqa: E501 + + return _select(df, args, invert, axis="index") diff --git a/janitor/functions/select_columns.py b/janitor/functions/select_columns.py deleted file mode 100644 index 5b5cde761..000000000 --- a/janitor/functions/select_columns.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Implementation of select_columns""" -from typing import Optional, Union -import pandas_flavor as pf -import pandas as pd -from pandas.api.types import is_list_like -from janitor.utils import deprecated_alias, check - -from janitor.functions.utils import _select_column_names - - -@pf.register_dataframe_method -@deprecated_alias(search_cols="search_column_names") -def select_columns( - df: pd.DataFrame, - *args, - level: Optional[Union[int, str]] = None, - invert: bool = False, -) -> pd.DataFrame: - """ - Method-chainable selection of columns. - - Not applicable to MultiIndex columns. - - It accepts a string, shell-like glob strings `(*string*)`, - regex, slice, array-like object, or a list of the previous options. - - This method does not mutate the original DataFrame. - - Optional ability to invert selection of columns available as well. - - Example: - - >>> import pandas as pd - >>> import janitor - >>> df = pd.DataFrame({"col1": [1, 2], "foo": [3, 4], "col2": [5, 6]}) - >>> df - col1 foo col2 - 0 1 3 5 - 1 2 4 6 - >>> df.select_columns("col*") - col1 col2 - 0 1 5 - 1 2 6 - - :param df: A pandas DataFrame. - :param args: Valid inputs include: an exact column name to look for, - a shell-style glob string (e.g., `*_thing_*`), - a regular expression, - a callable which is applicable to each Series in the DataFrame, - or variable arguments of all the aforementioned. - A sequence of booleans is also acceptable. - :param level: Determines which level in the columns should be used for the - column selection. - :param invert: Whether or not to invert the selection. - This will result in the selection of the complement of the columns - provided. - :returns: A pandas DataFrame with the specified columns selected. - """ # noqa: E501 - - # applicable for any - # list-like object (ndarray, Series, pd.Index, ...) - search_column_names = [] - for arg in args: - if is_list_like(arg) and (not isinstance(arg, tuple)): - search_column_names.extend(arg) - else: - search_column_names.append(arg) - if level is not None: - # goal here is to capture the original columns - # trim the df.columns to the specified level only, - # and apply the selection (_select_column_names) - # to get the relevant column labels. - # note that no level is dropped; if there are three levels, - # then three levels are returned, with the specified labels - # selected/deselected. - # A copy of the dataframe is made via set_axis, - # to avoid mutating the original dataframe. - df_columns = df.columns - check("level", level, [int, str]) - full_column_list = df_columns.get_level_values(level) - full_column_list = _select_column_names( - search_column_names, df.set_axis(full_column_list, axis=1) - ) - full_column_list = df_columns.isin(full_column_list, level=level) - full_column_list = df_columns[full_column_list] - else: - full_column_list = _select_column_names(search_column_names, df) - if invert: - return df.drop(columns=full_column_list) - return df.loc[:, full_column_list] diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 597ef3885..e87c33d60 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1,11 +1,20 @@ """Utility functions for all of the functions submodule.""" -from itertools import chain import fnmatch import warnings from collections.abc import Callable as dispatch_callable import re -from typing import Hashable, Iterable, List, Optional, Pattern, Union +from typing import ( + Hashable, + Iterable, + List, + Optional, + Pattern, + Union, + Callable, +) from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray +from pandas.core.common import is_bool_indexer + import pandas as pd from janitor.utils import check, _expand_grid @@ -17,11 +26,12 @@ is_string_dtype, is_categorical_dtype, is_extension_array_dtype, + is_bool_dtype, ) import numpy as np from multipledispatch import dispatch from janitor.utils import check_column -import functools +from functools import singledispatch warnings.simplefilter("always", DeprecationWarning) @@ -211,223 +221,312 @@ def _factorize(df, column_name, suffix, **kwargs): # noqa: F811 return df -@functools.singledispatch -def _select_column_names(columns_to_select, df): +def _is_str_or_cat(index): """ - base function for column selection. - Returns a list of column names. + Check if the column/index is a string, + or categorical with strings. """ - if columns_to_select in df.columns: - return [columns_to_select] - raise KeyError(f"No match was returned for {columns_to_select}.") + if is_categorical_dtype(index): + return is_string_dtype(index.categories) + return is_string_dtype(index) + + +def _select_regex(index, arg, source="regex"): + "Process regex on a Pandas Index" + assert source in ("fnmatch", "regex"), source + try: + if source == "fnmatch": + arg, regex = arg + bools = index.str.match(regex, na=False) + else: + bools = index.str.contains(arg, na=False, regex=True) + if not bools.any(): + raise KeyError(f"No match was returned for '{arg}'") + return bools + except Exception as exc: + raise KeyError(f"No match was returned for '{arg}'") from exc -def _is_str_or_cat(df_columns): - """Check if the column is a string or categorical with strings.""" - if is_string_dtype(df_columns): - return True - if is_categorical_dtype(df_columns): - return is_string_dtype(df_columns.categories) - return False +def _select_callable(arg, func: Callable, axis=None): + """ + Process a callable on a Pandas DataFrame/Index. + """ + bools = func(arg) + bools = np.asanyarray(bools) + if not is_bool_dtype(bools): + raise ValueError( + "The output of the applied callable " + "should be a 1-D boolean array." + ) + if axis: + arg = getattr(arg, axis) + if len(bools) != len(arg): + raise IndexError( + f"The boolean array output from the callable {arg} " + f"has wrong length: " + f"{len(bools)} instead of {len(arg)}" + ) + return bools -@_select_column_names.register(str) # noqa: F811 -def _column_sel_dispatch(columns_to_select, df): # noqa: F811 +@singledispatch +def _select_index(arg, df, axis): """ - Base function for column selection. + Base function for selection on a Pandas Index object. + + Returns either an integer, a slice, + a sequence of booleans, or an array of integers, + that match the exact location of the target. + """ + try: + return getattr(df, axis).get_loc(arg) + except Exception as exc: + raise KeyError(f"No match was returned for {arg}") from exc + + +@_select_index.register(str) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 + """ + Base function for selection on a Pandas Index object. Applies only to strings. It is also applicable to shell-like glob strings, which are supported by `fnmatch`. - A list/pandas Index of matching column names is returned. + + Returns either a sequence of booleans, an integer, + or a slice. """ - df_columns = df.columns - - if _is_str_or_cat(df_columns): - if columns_to_select in df_columns: - return [columns_to_select] - # fix for Github Issue 1160 - outcome = [ - fnmatch.fnmatchcase(column, columns_to_select) for column in df - ] - if not any(outcome): - raise KeyError(f"No match was returned for '{columns_to_select}'.") - return df_columns[outcome] - - if is_datetime64_dtype(df_columns): - timestamp = df_columns.get_loc(columns_to_select) - if not isinstance(timestamp, int): - return df_columns[timestamp] - return [df_columns[timestamp]] - - raise KeyError(f"No match was returned for '{columns_to_select}'.") - - -@_select_column_names.register(re.Pattern) # noqa: F811 -def _column_sel_dispatch(columns_to_select, df): # noqa: F811 + index = getattr(df, axis) + if _is_str_or_cat(index) or is_datetime64_dtype(index): + try: + return index.get_loc(arg) + except KeyError as exc: + if _is_str_or_cat(index): + if isinstance(index, pd.MultiIndex): + index = index.get_level_values(0) + # label selection should be case sensitive + # fix for Github Issue 1160 + # translating to regex solves the case sensitivity + # and also avoids the list comprehension + # not that list comprehension is bad - i'd say it is efficient + # however, the Pandas str.match method used in _select_regex + # could offer more performance, especially if the + # underlying array of the index is a PyArrow string array + return _select_regex( + index, (arg, fnmatch.translate(arg)), source="fnmatch" + ) + raise KeyError(f"No match was returned for '{arg}'") from exc + raise KeyError(f"No match was returned for '{arg}'") + + +@_select_index.register(re.Pattern) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 """ - Base function for column selection. + Base function for selection on a Pandas Index object. Applies only to regular expressions. `re.compile` is required for the regular expression. - A pandas Index of matching column names is returned. - """ - df_columns = df.columns - if _is_str_or_cat(df_columns): - bools = df_columns.str.contains( - columns_to_select, na=False, regex=True - ) - if not bools.any(): - raise KeyError(f"No match was returned for {columns_to_select}.") - return df_columns[bools] - raise KeyError(f"No match was returned for {columns_to_select}.") + Returns an array of booleans. + """ + index = getattr(df, axis) + if isinstance(index, pd.MultiIndex): + index = index.get_level_values(0) + return _select_regex(index, arg) -@_select_column_names.register(slice) # noqa: F811 -def _column_sel_dispatch(columns_to_select, df): # noqa: F811 +@_select_index.register(range) # noqa: F811 +@_select_index.register(slice) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 """ - Base function for column selection. + Base function for selection on a Pandas Index object. Applies only to slices. - The start slice value must be a string/tuple/None, - or exist in the dataframe's columns; - same goes for the stop slice value. - The step slice value should be an integer or None. - A slice, if passed correctly in a Multindex column, - returns a list of tuples across all levels of the - column. - - A pandas Index of matching column names is returned. + Returns a slice object. """ - df_columns = df.columns - filtered_columns = None - start_check = None - stop_check = None - step_check = None - method = None - - if not df_columns.is_unique and not df_columns.is_monotonic_increasing: - raise ValueError( - "Non-unique column labels should be monotonic increasing." - ) - - is_date_column = is_datetime64_dtype(df_columns) - if is_date_column: - if not df_columns.is_monotonic_increasing: + index = getattr(df, axis) + if not index.is_monotonic_increasing: + if not index.is_unique: raise ValueError( - "The column is a DatetimeIndex and should be " - "monotonic increasing." + "Non-unique Index labels should be monotonic increasing." + "Kindly sort the index." ) - method = "nearest" - - start, stop, step = ( - columns_to_select.start, - columns_to_select.stop, - columns_to_select.step, - ) - - step_check = any((step is None, isinstance(step, int))) - if not step_check: - raise ValueError( - "The step value for the slice " - "must either be an integer or `None`." - ) - - if not is_date_column: - start_check = any((start is None, start in df_columns)) - if not start_check: - raise ValueError( - "The start value for the slice must either be `None` " - "or exist in the dataframe's columns." - ) - stop_check = any((stop is None, stop in df_columns)) - if not stop_check: + if is_datetime64_dtype(index): raise ValueError( - "The stop value for the slice must either be `None` " - "or exist in the dataframe's columns." + "The DatetimeIndex should be monotonic increasing." + "Kindly sort the index" ) - if start is None: - start = 0 - else: - start = df_columns.get_loc(start, method=method) - if isinstance(start, slice): - start = start.start - if stop is None: - stop = len(df_columns) + 1 - else: - stop = df_columns.get_loc(stop, method=method) - if isinstance(stop, slice): - stop = stop.stop - 1 - - if start > stop: - filtered_columns = df_columns[slice(stop, start + 1, step)][::-1] - else: - filtered_columns = df_columns[slice(start, stop + 1, step)] - return filtered_columns + return index._convert_slice_indexer(arg, kind="loc") -@_select_column_names.register(dispatch_callable) # noqa: F811 -def _column_sel_dispatch(columns_to_select, df): # noqa: F811 +@_select_index.register(dispatch_callable) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 """ - Base function for column selection. + Base function for selection on a Pandas Index object. Applies only to callables. - The callable is applied to every column in the dataframe. - Either True or False is expected per column. - A pandas Index of matching column names is returned. + The callable is applied to the entire DataFrame. + + Returns an array of booleans. """ - # the function will be applied per series. - # this allows filtration based on the contents of the series - # or based on the name of the series, - # which happens to be a column name as well. - # whatever the case may be, - # the returned values should be a sequence of booleans, - # with at least one True. - filtered_columns = df.apply(columns_to_select) + return _select_callable(df, arg, axis) - if not pd.api.types.is_bool_dtype(filtered_columns): + +@_select_index.register(dict) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 + """ + Base function for selection on a Pandas Index object. + Applies only to a dictionary. + + Returns an array of integers. + """ + level_label = {} + index = getattr(df, axis) + if not isinstance(index, pd.MultiIndex): raise TypeError( - "The output of the applied callable should be a boolean array." + "Index selection with a dictionary " + "applies only to a MultiIndex." ) - if not filtered_columns.any(): - raise KeyError(f"No match was returned for {columns_to_select}.") + all_str = (isinstance(entry, str) for entry in arg) + all_str = all(all_str) + all_int = (isinstance(entry, int) for entry in arg) + all_int = all(all_int) + if not all_str | all_int: + raise TypeError( + "The keys in the dictionary represent the levels " + "in the MultiIndex, and should either be all " + "strings or integers." + ) + for key, value in arg.items(): + if isinstance(value, dispatch_callable): + indexer = index.get_level_values(key) + value = _select_callable(indexer, value) + elif isinstance(value, re.Pattern): + indexer = index.get_level_values(key) + value = _select_regex(indexer, value) + level_label[key] = value + + level_label = { + index._get_level_number(level): label + for level, label in level_label.items() + } + level_label = [ + level_label.get(num, slice(None)) for num in range(index.nlevels) + ] + return index.get_locs(level_label) + + +@_select_index.register(np.ndarray) # noqa: F811 +@_select_index.register(ABCPandasArray) # noqa: F811 +@_select_index.register(ABCExtensionArray) # noqa: F811 +@_select_index.register(pd.Index) # noqa: F811 +@_select_index.register(pd.MultiIndex) # noqa: F811 +@_select_index.register(pd.Series) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 + """ + Base function for selection on a Pandas Index object. + Applies to pd.Series/pd.Index/pd.array/np.ndarray. - return df.columns[filtered_columns] + Returns an array of integers. + """ + index = getattr(df, axis) + + if is_bool_dtype(arg): + if len(arg) != len(index): + raise IndexError( + f"{arg} is a boolean dtype and has wrong length: " + f"{len(arg)} instead of {len(index)}" + ) + return arg + try: + + if isinstance(arg, pd.Series): + arr = arg.array + else: + arr = arg + if isinstance(index, pd.MultiIndex) and not isinstance( + arg, pd.MultiIndex + ): + return index.get_locs([arg]) + arr = index.get_indexer_for(arr) + not_found = arr == -1 + if not_found.all(): + raise KeyError( + f"No match was returned for any of the labels in {arg}" + ) + elif not_found.any(): + not_found = set(arg).difference(index) + raise KeyError( + f"No match was returned for these labels in {arg} - " + f"{*not_found,}" + ) + return arr + except Exception as exc: + raise KeyError(f"No match was returned for {arg}") from exc -@_select_column_names.register(list) # noqa: F811 -def _column_sel_dispatch(columns_to_select, df): # noqa: F811 +@_select_index.register(list) # noqa: F811 +def _index_dispatch(arg, df, axis): # noqa: F811 """ - Base function for column selection. + Base function for selection on a Pandas Index object. Applies only to list type. It can take any of slice, str, callable, re.Pattern types, ..., or a combination of these types. - A list of column names is returned. - """ - if all(map(pd.api.types.is_bool, columns_to_select)): - if len(columns_to_select) != len(df.columns): + Returns an array of integers. + """ + index = getattr(df, axis) + if is_bool_indexer(arg): + if len(arg) != len(index): raise ValueError( "The length of the list of booleans " - f"({len(columns_to_select)}) does not match " - f"the number of columns({df.columns.size}) " - "in the dataframe." + f"({len(arg)}) does not match " + f"the length of the DataFrame's {axis}({index.size})." ) - return df.columns[columns_to_select] - - filtered_columns = ( - _select_column_names(entry, df) for entry in columns_to_select - ) - - filtered_columns = list(chain.from_iterable(filtered_columns)) - - # get rid of possible duplicates - if len(filtered_columns) != len(set(filtered_columns)): - filtered_columns = pd.unique(filtered_columns) + return arg + + indices = [_select_index(entry, df, axis) for entry in arg] + + # single entry does not need to be combined + # or materialized if possible; + # this offers more performance + if len(indices) == 1: + if isinstance(indices[0], int): + return indices + if is_list_like(indices[0]): + return np.asanyarray(indices[0]) + return indices[0] + contents = [] + for arr in indices: + if is_list_like(arr): + arr = np.asanyarray(arr) + if is_bool_dtype(arr): + arr = arr.nonzero()[0] + elif isinstance(arr, slice): + arr = range(index.size)[arr] + elif isinstance(arr, int): + arr = [arr] + contents.append(arr) + contents = np.concatenate(contents) + # remove possible duplicates + return pd.unique(contents) + + +def _select( + df: pd.DataFrame, args: tuple, invert: bool, axis: str +) -> pd.DataFrame: + """ + Index DataFrame on the index or columns. - return filtered_columns + Returns a DataFrame. + """ + indices = _select_index(list(args), df, axis) + if invert: + rev = np.ones(getattr(df, axis).size, dtype=np.bool8) + rev[indices] = False + return df.iloc(axis=axis)[rev] + return df.iloc(axis=axis)[indices] def _convert_to_numpy_array( diff --git a/tests/functions/test_case_when.py b/tests/functions/test_case_when.py index 8971f4e31..aa7b2e516 100644 --- a/tests/functions/test_case_when.py +++ b/tests/functions/test_case_when.py @@ -166,6 +166,7 @@ def test_case_when_replacement_callable(df): @given(df=categoricaldf_strategy()) +@settings(deadline=None) def test_case_when_default_array(df): """ Test case_when for scenarios where `default` is array-like @@ -183,6 +184,7 @@ def test_case_when_default_array(df): @given(df=categoricaldf_strategy()) +@settings(deadline=None) def test_case_when_default_list_like(df): """ Test case_when for scenarios where `default` is list-like, @@ -201,6 +203,7 @@ def test_case_when_default_list_like(df): @given(df=categoricaldf_strategy()) +@settings(deadline=None) def test_case_when_default_index(df): """ Test case_when for scenarios where `default` is an index. diff --git a/tests/functions/test_encode_categorical.py b/tests/functions/test_encode_categorical.py index fd4d6eeb7..84ec60f95 100644 --- a/tests/functions/test_encode_categorical.py +++ b/tests/functions/test_encode_categorical.py @@ -234,6 +234,7 @@ def test_empty_col_sort(df): @pytest.mark.functions @given(df=df_strategy()) +@settings(deadline=None) def test_empty_col_appearance(df): """ Raise ValueError if a string is provided, @@ -257,6 +258,7 @@ def test_empty_col_appearance(df): @pytest.mark.functions @given(df=categoricaldf_strategy()) +@settings(deadline=None) def test_all_None(df): """ Test output where value is None. @@ -269,6 +271,7 @@ def test_all_None(df): @pytest.mark.functions @given(df=categoricaldf_strategy()) +@settings(deadline=None) def test_all_cat_None_1(df): """ Test output where a string is provided. diff --git a/tests/functions/test_expand_grid.py b/tests/functions/test_expand_grid.py index 492958306..f03a427f5 100644 --- a/tests/functions/test_expand_grid.py +++ b/tests/functions/test_expand_grid.py @@ -13,6 +13,7 @@ @given(df=df_strategy()) +@settings(deadline=None) def test_others_not_dict(df): """Raise Error if `others` is not a dictionary.""" with pytest.raises(TypeError): @@ -32,6 +33,7 @@ def test_others_empty(): @given(df=df_strategy()) +@settings(deadline=None) def test_df_key(df): """Raise error if df exists and df_key is not supplied.""" with pytest.raises(KeyError): diff --git a/tests/functions/test_pivot_longer.py b/tests/functions/test_pivot_longer.py index 4ac66f8c5..44e476485 100644 --- a/tests/functions/test_pivot_longer.py +++ b/tests/functions/test_pivot_longer.py @@ -305,6 +305,46 @@ def test_column_names_tuple_multiindex(df_multi): df_multi.pivot_longer(column_names=("names", "aa")) +def test_column_names_missing_multiindex(df_multi): + """ + Raise ValueError if column_names is a list of tuples, + the dataframe's column is a MultiIndex, + and the tuple cannot be found. + """ + with pytest.raises(KeyError): + df_multi.pivot_longer(column_names=[("names", "bb")]) + + +def test_index_missing_multiindex(df_multi): + """ + Raise ValueError if index is a list of tuples, + the dataframe's column is a MultiIndex, + and the tuple cannot be found. + """ + with pytest.raises(KeyError): + df_multi.pivot_longer(index=[("names", "bb")]) + + +def test_column_names_not_all_tuples_multiindex(df_multi): + """ + Raise ValueError if column_names is a list of tuples, + the dataframe's column is a MultiIndex, + and one of the entries is not a tuple. + """ + with pytest.raises(TypeError): + df_multi.pivot_longer(column_names=[("names", "aa"), "a"]) + + +def test_index_not_all_tuples_multiindex(df_multi): + """ + Raise ValueError if index is a list of tuples, + the dataframe's column is a MultiIndex, + and one of the entries is not a tuple. + """ + with pytest.raises(TypeError): + df_multi.pivot_longer(index=[("names", "aa"), "a"]) + + def test_sort_by_appearance(df_checks): """Raise error if sort_by_appearance is not boolean.""" with pytest.raises(TypeError): diff --git a/tests/functions/test_pivot_wider.py b/tests/functions/test_pivot_wider.py index 81d6a9f97..6e2d5ab99 100644 --- a/tests/functions/test_pivot_wider.py +++ b/tests/functions/test_pivot_wider.py @@ -573,3 +573,120 @@ def test_expand_multiple_levels_flatten_levels(df_expand): .reset_index() ) assert_frame_equal(actual, expected) + + +@pytest.fixture +def multi(): + """fixture for MultiIndex column""" + columns = pd.MultiIndex.from_tuples( + [("first", "extra"), ("second", "extra"), ("A", "cat")], + names=["exp", "animal"], + ) + + data = np.array( + [ + ["bar", "one", 0.10771469563752678], + ["bar", "two", -0.6453410828562166], + ["baz", "one", 0.3210232406192864], + ["baz", "two", 2.010694653300755], + ], + dtype=object, + ) + + return pd.DataFrame(data, columns=columns) + + +errors = [ + ["multi", ("first", "extra"), [("second", "extra")], None], + ["multi", [("first", "extra")], ("second", "extra"), None], + ("multi", None, [("second", "extra")], ("A", "cat")), +] + + +@pytest.mark.parametrize( + "multi,index,names_from,values_from", errors, indirect=["multi"] +) +def test_multiindex(multi, index, names_from, values_from): + """ + Raise if df.columns is a MultiIndex + and index/names_from/values_from + is not a list of tuples + """ + with pytest.raises(TypeError): + multi.pivot_wider( + index=index, names_from=names_from, values_from=values_from + ) + + +def test_multiindex_values_from(multi): + """ + Raise if df.columns is a MultiIndex, + values_from is a list of tuples, + and not all entries are tuples + """ + with pytest.raises(TypeError): + multi.pivot_wider( + names_from=[("second", "extra")], values_from=[("A", "cat"), "A"] + ) + + +def test_multiindex_index(multi): + """ + Raise if df.columns is a MultiIndex, + index is a list of tuples, + and not all entries are tuples + """ + with pytest.raises(TypeError): + multi.pivot_wider( + names_from=[("second", "extra")], + index=[("first", "extra"), "first"], + ) + + +def test_multi_index_values_from(multi): + """ + Raise if df.columns is a MultiIndex, + values_from is a list of tuples, + and not all entries are tuples + """ + with pytest.raises(TypeError): + multi.pivot_wider( + names_from=[("second", "extra"), "first"], + values_from=[("A", "cat"), "A"], + ) + + +def test_multiindex_values_from_missing(multi): + """ + Raise if df.columns is a MultiIndex, + values_from is a list of tuples, + and a tuple is missing + """ + with pytest.raises(KeyError): + multi.pivot_wider( + names_from=[("second", "extra")], values_from=[("A", "ct")] + ) + + +def test_multiindex_index_missing(multi): + """ + Raise if df.columns is a MultiIndex, + index is a list of tuples, + and a tuple is missing + """ + with pytest.raises(KeyError): + multi.pivot_wider( + names_from=[("second", "extra")], index=[("first", "ext")] + ) + + +def test_multi_index_values_from_missing(multi): + """ + Raise if df.columns is a MultiIndex, + values_from is a list of tuples, + and a tuple is missing + """ + with pytest.raises(KeyError): + multi.pivot_wider( + names_from=[("sec", "extra")], values_from=[("A", "cat")] + ) diff --git a/tests/functions/test_select_columns.py b/tests/functions/test_select_columns.py index 137170d31..cde32079a 100644 --- a/tests/functions/test_select_columns.py +++ b/tests/functions/test_select_columns.py @@ -1,7 +1,12 @@ import pandas as pd -import pytest +import datetime +import numpy as np import re +import pytest from pandas.testing import assert_frame_equal +from itertools import product + +from janitor.functions.utils import patterns @pytest.mark.functions @@ -33,7 +38,7 @@ def test_select_column_names_glob_inputs(dataframe, invert, expected): columns = ["Bell__Chart", "a*"] df = dataframe.select_columns(columns, invert=invert) - assert_frame_equal(df, dataframe[expected]) + assert_frame_equal(df, dataframe.loc(axis=1)[expected]) @pytest.mark.functions @@ -79,8 +84,8 @@ def test_select_unique_columns(dataframe, invert, expected): def test_select_callable_columns(dataframe, invert, expected): """Test that columns are returned when a callable is passed.""" - def columns(x): - return "-" in x.name or "_" in x.name + def columns(frame): + return frame.columns.str.contains("[-,__]") df = dataframe.select_columns(columns, invert=invert) @@ -88,59 +93,322 @@ def columns(x): @pytest.fixture -def df_tuple(): - "pytest fixture." - frame = pd.DataFrame( +def multiindex(): + """pytest fixture.""" + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + pd.Categorical( + ["one", "two", "one", "two", "one", "two", "one", "two"] + ), + ] + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) + return pd.DataFrame(np.random.randn(4, 8), columns=index) + + +def test_multiindex(multiindex): + """ + Test output for a MultiIndex and tuple passed. + """ + assert_frame_equal( + multiindex.select_columns(("bar", "one")), + multiindex.loc[:, [("bar", "one")]], + ) + + +@pytest.fixture +def df_dates(): + """pytest fixture""" + start = datetime.datetime(2011, 1, 1) + end = datetime.datetime(2012, 1, 1) + rng = pd.date_range(start, end, freq="BM") + return pd.DataFrame([np.random.randn(len(rng))], columns=rng) + + +@pytest.fixture +def df_strings(): + """pytest fixture.""" + return pd.DataFrame( { - "A": {0: "a", 1: "b", 2: "c"}, - "B": {0: 1, 1: 3, 2: 5}, - "C": {0: 2, 1: 4, 2: 6}, + "id": [0, 1], + "Name": ["ABC", "XYZ"], + "code": [1, 2], + "code1": [4, np.nan], + "code2": ["8", 5], + "type": ["S", "R"], + "type1": ["E", np.nan], + "type2": ["T", "U"], + "code3": pd.Series(["a", "b"], dtype="category"), + "type3": pd.to_datetime( + [np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] + ), } ) - frame.columns = [list("ABC"), list("DEF")] - return frame -def test_multiindex(df_tuple): +@pytest.fixture +def numbers(): + """pytest fixture""" + return pd.DataFrame([np.random.randn(20)], columns=range(20)) + + +def test_col_not_found(numbers): """ - Test output for a MultiIndex and tuple passed. + Raise KeyError if the search value is a string, + is not in df.columns, + and df.columns is not date/string/categorical. """ + with pytest.raises(KeyError, match="No match was returned.+"): + numbers.select_columns("sam") + + +def test_col_not_found3(df_dates): + """ + Raise KeyError if the search value is not in df.columns, + and df.columns is a datetime index. + """ + with pytest.raises(KeyError): + df_dates.select_columns("id") + + +def test_strings_cat(df_strings): + """Test output on categorical columns""" + df_strings.columns = df_strings.columns.astype("category") + assert_frame_equal( + df_strings.select_columns("id"), df_strings.loc[:, ["id"]] + ) + assert_frame_equal( + df_strings.select_columns("*type*"), df_strings.filter(like="type") + ) + + +def test_regex(df_strings): + """Test output on regular expressions.""" assert_frame_equal( - df_tuple.select_columns(("A", "D")), df_tuple.loc[:, [("A", "D")]] + df_strings.select_columns(re.compile(r"\d$")), + df_strings.filter(regex=r"\d$"), ) -def test_level_callable(df_tuple): +def test_regex_cat(df_strings): + """Test output on categorical columns""" + df_strings.columns = df_strings.columns.astype("category") + assert_frame_equal( + df_strings.select_columns(re.compile(r"\d$")), + df_strings.filter(regex=r"\d$"), + ) + + +def test_patterns_warning(df_strings): + """ + Check that warning is raised if `janitor.patterns` is used. + """ + with pytest.warns(DeprecationWarning): + assert_frame_equal( + df_strings.select_columns(patterns(r"\d$")), + df_strings.filter(regex=r"\d$"), + ) + + +def test_regex_presence_string_column(df_strings): + """ + Raise KeyError if search_value is a regex + and does not exist in the dataframe's columns. + """ + with pytest.raises(KeyError, match="No match was returned for.+"): + df_strings.select_columns(re.compile("word")) + + +def test_regex_presence(df_dates): + """ + Raise KeyError if search_value is a regex + and the columns is not a string column. + """ + with pytest.raises(KeyError, match=r"No match was returned.+"): + df_dates.select_columns(re.compile(r"^\d+")) + + +def test_slice_unique(): """ - Test output if level is supplied for a callable. + Raise ValueError if the columns are not unique. """ - expected = df_tuple.select_columns( - lambda df: df.name.startswith("A"), level=0 + not_unique = pd.DataFrame([], columns=["code", "code2", "code1", "code"]) + with pytest.raises( + ValueError, + match="Non-unique Index labels should be monotonic increasing.", + ): + not_unique.select_columns(slice("code", "code2")) + + +def test_unsorted_dates_slice(df_dates): + """Raise Error if the date column is unsorted.""" + df_dates = df_dates.iloc[:, ::-1] + with pytest.raises( + ValueError, + match="The DatetimeIndex should be monotonic increasing.", + ): + df_dates.select_columns(slice("2011-01-31", "2011-03-31")) + + +slicers = [ + slice("code", "code2"), + slice("code2", None), + slice(None, "code2"), + slice(None, None), + slice(None, None, 2), +] +slicers = product(["df_strings"], slicers) + + +@pytest.mark.parametrize( + "df_strings, slicer", slicers, indirect=["df_strings"] +) +def test_slice(df_strings, slicer): + """Test output on slices.""" + assert_frame_equal( + df_strings.select_columns(slicer), df_strings.loc[:, slicer] ) - actual = df_tuple.xs("A", axis=1, drop_level=False, level=0) + + +def test_slice_reverse(df_strings): + """ + Test output on a reverse slice + """ + actual = df_strings.select_columns(slice("code2", "code", -1)) + expected = df_strings.loc[ + :, + [ + "code2", + "code1", + "code", + ], + ] + assert_frame_equal(actual, expected) -def test_level_regex(df_tuple): +def test_slice_dates(df_dates): + """Test output of slice on date column.""" + actual = df_dates.select_columns(slice("2011-01-31", "2011-03-31")) + expected = df_dates.loc[:, "2011-01-31":"2011-03-31"] + assert_frame_equal(actual, expected) + + +def test_slice_dates_inexact(df_dates): + """Test output of slice on date column.""" + actual = df_dates.select_columns(slice("2011-01", "2011-03")) + expected = df_dates.loc[:, "2011-01":"2011-03"] + assert_frame_equal(actual, expected) + + +def test_boolean_list_dtypes(df_dates): """ - Test output if level is supplied for a regex + Raise ValueError if the search value + is a list of booleans and the length + is unequal to the number of columns + in the dataframe. """ - expected = df_tuple.select_columns(re.compile("D"), level=1) - actual = df_tuple.xs("D", axis=1, drop_level=False, level=1) + with pytest.raises( + ValueError, match="The length of the list of booleans.+" + ): + df_dates.select_columns([True, False]) + + +def test_list_boolean(df_dates): + """Test output on a list of booleans.""" + booleans = np.repeat([True, False], 6) + actual = df_dates.select_columns(booleans) + expected = df_dates.loc[:, booleans] assert_frame_equal(actual, expected) -def test_level_slice(df_tuple): +def test_number_dates(df_dates): + """Raise if selecting number on a date column""" + with pytest.raises(KeyError, match="No match was returned for 2.5"): + df_dates.select_columns(2.5) + + +def test_callable(numbers): """ - Test output if level is supplied for a slice + Check that error is raised if `columns_to_select` is a + callable, and at lease one Series has a wrong data type + that makes the callable unapplicable. """ - expected = df_tuple.select_columns(slice("F", "D"), level=1) - assert_frame_equal(df_tuple, expected) + with pytest.raises( + ValueError, + match="The output of the applied callable " + "should be a 1-D boolean array.", + ): + numbers.select_columns(lambda df: df + 3) -def test_level_str(df_tuple): +def test_callable_length(numbers): """ - Test output if level is supplied for a string. + Raise if the boolean output from the callable + is not the same as the length of the columns. """ - expected = df_tuple.select_columns("A", level=0, invert=True) - assert_frame_equal(df_tuple.drop(columns="A", axis=1, level=0), expected) + with pytest.raises( + IndexError, match="The boolean array output from the callable.+" + ): + numbers.select_columns(lambda df: [True, False]) + + +def test_dict(multiindex): + """Test output on a dict""" + mapp = {"first": ["bar", "qux"], "second": "two"} + expected = multiindex.select_columns(mapp) + actual = multiindex.loc(axis=1)[["bar", "qux"], "two"] + assert_frame_equal(expected, actual) + + +def test_dict_callable(multiindex): + """Test output on a dict""" + mapp = {"first": ["bar", "qux"], "second": lambda df: df == "two"} + expected = multiindex.select_columns(mapp) + actual = multiindex.loc(axis=1)[["bar", "qux"], "two"] + assert_frame_equal(expected, actual) + + +def test_dict_regex(multiindex): + """Test output on a dict""" + mapp = {"first": ["bar", "qux"], "second": re.compile("tw.")} + expected = multiindex.select_columns(mapp) + actual = multiindex.loc(axis=1)[["bar", "qux"], "two"] + assert_frame_equal(expected, actual) + + +def test_boolean_series_multi(multiindex): + """Test boolean output on a MultiIndex""" + mapp = pd.Series([True, False]).repeat(4) + expected = multiindex.select_columns(mapp, "foo") + actual = multiindex.loc(axis=1)["bar":"foo"] + assert_frame_equal(expected, actual) + + +def test_boolean_list_multi(multiindex): + """Test boolean output on a MultiIndex""" + mapp = [True, True, True, True, False, False, False, False] + expected = multiindex.select_columns(mapp, "foo") + actual = multiindex.loc(axis=1)["bar":"foo"] + assert_frame_equal(expected, actual) + + +def test_series_multi(multiindex): + """Test pd.Series output on a MultiIndex""" + mapp = pd.Series(["bar"]) + expected = multiindex.select_columns(mapp, slice("foo")) + actual = multiindex.loc(axis=1)["bar":"foo"] + assert_frame_equal(expected, actual) + + +def test_glob_multi(multiindex): + """Test fnmatch output on a MultiIndex""" + expected = multiindex.select_columns("b*r") + actual = multiindex.loc(axis=1)[["bar"]] + assert_frame_equal(expected, actual) + + +def test_regex_multi(multiindex): + """Test regex output on a MultiIndex""" + expected = multiindex.select_columns(re.compile("b.r")) + actual = multiindex.loc(axis=1)[["bar"]] + assert_frame_equal(expected, actual) diff --git a/tests/functions/test_select_rows.py b/tests/functions/test_select_rows.py new file mode 100644 index 000000000..841afb77d --- /dev/null +++ b/tests/functions/test_select_rows.py @@ -0,0 +1,283 @@ +import datetime +import re + +import numpy as np +import pandas as pd +import pytest + +from pandas.testing import assert_frame_equal + + +@pytest.fixture +def dates(): + """pytest fixture""" + start = datetime.datetime(2011, 1, 1) + end = datetime.datetime(2012, 1, 1) + rng = pd.date_range(start, end, freq="BM") + return pd.DataFrame({"numbers": np.random.randn(len(rng))}, index=rng) + + +@pytest.fixture +def numbers(): + """pytest fixture""" + return pd.DataFrame({"num": np.random.randn(20)}) + + +@pytest.fixture +def multiindex(): + """pytest fixture.""" + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + pd.Categorical( + ["one", "two", "one", "two", "one", "two", "one", "two"] + ), + ] + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) + return pd.DataFrame(np.random.randn(8, 4), index=index) + + +def test_number_not_found_index(numbers): + """Raise KeyError if passed value is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + numbers.select_rows(2.5) + + +def test_string_not_found_numeric_index(numbers): + """Raise KeyError if passed value is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + numbers.select_rows("2.5") + + +def test_regex_not_found_numeric_index(numbers): + """Raise KeyError if passed value is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + numbers.select_rows(re.compile(".+")) + + +def test_regex_not_found_string_index(multiindex): + """Raise KeyError if passed value is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + multiindex.droplevel("second").select_rows(re.compile("t.+")) + + +def test_date_not_found(dates): + """Raise KeyError if passed value is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + dates.select_rows("2011-01-02") + + +def test_string_not_found_multi(multiindex): + """Raise KeyError if passed string is not found in the MultiIndex.""" + with pytest.raises(KeyError, match="No match was returned.+"): + multiindex.droplevel("second").select_rows("2.5") + + +def test_tuple_not_found(multiindex): + """Raise KeyError if passed tuple is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + multiindex.select_rows(("one", "bar")) + + +def test_list_not_found(numbers): + """Raise KeyError if passed value in list is not found in the index.""" + with pytest.raises(KeyError, match="No match was returned.+"): + numbers.select_rows([2.5, 3]) + + +def test_slice_unique(): + """ + Raise ValueError if the index is not unique. + """ + not_unique = pd.DataFrame([], index=["code", "code2", "code1", "code"]) + with pytest.raises( + ValueError, + match="Non-unique Index labels should be monotonic increasing.", + ): + not_unique.select_rows(slice("code", "code2")) + + +def test_unsorted_dates_slice(dates): + """Raise Error if the dates are unsorted.""" + with pytest.raises( + ValueError, + match="The DatetimeIndex should be monotonic increasing.", + ): + dates.iloc[::-1].select_rows(slice("2011-01-31", "2011-03-31")) + + +def test_boolean_list_uneven_length(dates): + """ + Raise ValueError if `rows` is a list of booleans + and the length is unequal to the length of the dataframe's index + """ + with pytest.raises( + ValueError, match="The length of the list of booleans.+" + ): + dates.select_rows([True, False]) + + +def test_invert_num(numbers): + """Test output when rows are dropped.""" + expected = numbers.select_rows([4, 6, 10], invert=True) + actual = numbers.drop([4, 6, 10]) + assert_frame_equal(expected, actual) + + +def test_date_partial_output(dates): + """Test output on a date""" + expected = dates.select_rows("2011") + actual = dates.loc["2011"] + assert_frame_equal(expected, actual, check_freq=False) + + +def test_date_actual_output(dates): + """Test output on a date""" + expected = dates.select_rows("2011-01-31") + actual = dates.loc[["2011-01-31"]] + assert_frame_equal(expected, actual, check_freq=False) + + +def test_slice_dates(dates): + """Test output of slice on dates.""" + slicer = slice("2011-01-31", "2011-03-31") + expected = dates.select_rows(slicer) + actual = dates.loc[slicer] + assert_frame_equal(expected, actual, check_freq=False) + + +def test_slice_dates_inexact(dates): + """Test output of slice on dates.""" + slicer = slice("2011-01", "2011-03") + expected = dates.select_rows(slicer) + actual = dates.loc[slicer] + assert_frame_equal(expected, actual, check_freq=False) + + +def test_slice1(dates): + """Test output of slice on index.""" + expected = dates.select_rows(slice(None, None)) + assert_frame_equal(expected, dates, check_freq=False) + + +def test_slice2(dates): + """Test output of slice on index.""" + expected = dates.select_rows(slice(None, None, 2)) + assert_frame_equal(expected, dates.loc[::2], check_freq=False) + + +def test_boolean_list(multiindex): + """ + Test output for boolean list + """ + booleans = [True, True, True, False, False, False, True, True] + expected = multiindex.select_rows(booleans) + assert_frame_equal(multiindex.loc[booleans], expected) + + +def test_callable(dates): + """ + Test output for callable + """ + func = lambda df: df.index.month == 4 # noqa : E731 + assert_frame_equal( + dates.loc[func], dates.select_rows(func), check_freq=False + ) + + +def test_multiindex_tuple_present(multiindex): + """ + Test output for a MultiIndex and tuple passed. + """ + assert_frame_equal( + multiindex.select_rows(("bar", "one")), + multiindex.loc[[("bar", "one")]], + ) + + +def test_errors_MultiIndex_dict(multiindex): + """ + Raise if `level` is an int/string + and duplicated + """ + ix = {"second": "one", 0: "bar"} + msg = "The keys in the dictionary represent the levels " + msg += "in the MultiIndex, and should either be all " + msg += "strings or integers." + with pytest.raises(TypeError, match=msg): + multiindex.select_rows(ix) + + +def test_dict(multiindex): + """Test output on a dict""" + mapp = {"first": ["bar", "qux"], "second": "two"} + expected = multiindex.select_rows(mapp) + actual = multiindex.loc(axis=0)[["bar", "qux"], "two"] + assert_frame_equal(expected, actual) + + +def test_boolean_multiindex(multiindex): + """Raise if boolean length does not match index length""" + with pytest.raises(IndexError): + multiindex.select_rows(lambda df: [True, False]) + + +def test_dict_single_index(dates): + """ + Raise if a dictionary is passed, + and the index is not a MultiIndex + """ + with pytest.raises(TypeError): + dates.select_rows({0: "2011-01-31"}) + + +def test_array(dates): + """Test output for pandas array""" + arr = pd.array(["2011-01-31"]) + expected = dates.select_rows(arr) + actual = dates.loc[arr] + assert_frame_equal(expected, actual, check_freq=False) + + +def test_series(dates): + """Test output for pandas Series""" + arr = pd.Series(["2011-01-31"]) + expected = dates.select_rows(arr) + actual = dates.loc[arr] + assert_frame_equal(expected, actual, check_freq=False) + + +def test_numpy_array(dates): + """Test output for pandas array""" + arr = np.array(["2011-01-31"]) + expected = dates.select_rows(arr) + actual = dates.loc[arr] + assert_frame_equal(expected, actual) + + +def test_array_bool(dates): + """Test output for pandas array""" + arr = np.array([True, False]).repeat(6) + expected = dates.select_rows(arr) + actual = dates.loc[arr] + assert_frame_equal(expected, actual) + + +def test_boolean_Index(dates): + """Raise if boolean is not same length as index""" + with pytest.raises(IndexError): + arr = pd.Index([True, False]).repeat(4) + dates.select_rows(arr) + + +def test_missing_all_array(dates): + """Raise if none of the labels exist.""" + with pytest.raises(KeyError): + arr = pd.array(["2011"]) + dates.select_rows(arr) + + +def test_missing_some_array(dates): + """Raise if some of the labels do not exist.""" + with pytest.raises(KeyError): + arr = pd.array(["2011", "2011-01-31"]) + dates.select_rows(arr) diff --git a/tests/math/test_ecdf.py b/tests/math/test_ecdf.py index 03d71d159..58d3b3ea5 100644 --- a/tests/math/test_ecdf.py +++ b/tests/math/test_ecdf.py @@ -1,10 +1,12 @@ import numpy as np import pytest from hypothesis import given +from hypothesis import settings from hypothesis.extra.pandas import series @given(s=series(dtype=np.number)) +@settings(deadline=None) def test_ecdf(s): """A simple execution test.""" if s.isna().sum() > 0: diff --git a/tests/test_documentation_build.py b/tests/test_documentation_build.py index a10d1b540..353ad6b5d 100644 --- a/tests/test_documentation_build.py +++ b/tests/test_documentation_build.py @@ -1,6 +1,6 @@ """Tests for documentation build.""" -import os +import os import pytest @@ -19,7 +19,7 @@ def test_docs_general_functions_present(): all of the functions are present in the docs. This is an awesome thing that we could use help with in the future. """ - # Build docs using mkdocs + # Build docs using mkdocs os.system("mkdocs build --clean") # We want to check that the following keywords are all present. diff --git a/tests/timeseries/test_fill_missing_timestamps.py b/tests/timeseries/test_fill_missing_timestamps.py index f07186e2d..c6125b5e6 100644 --- a/tests/timeseries/test_fill_missing_timestamps.py +++ b/tests/timeseries/test_fill_missing_timestamps.py @@ -26,7 +26,16 @@ def test_fill_missing_timestamps(timeseries_dataframe): df1 = timeseries_dataframe.drop(timeseries_dataframe.index[random_number]) # Fill missing timestamps - result = fill_missing_timestamps(df1, frequency="1H") + # fix for GH#1184 is to use the start and end from + # timeseries_dataframe + # imagine that the last row of df1 is removed, or the first entry + # the length check in the assert line will fail + result = fill_missing_timestamps( + df1, + frequency="1H", + first_time_stamp=timeseries_dataframe.index.min(), + last_time_stamp=timeseries_dataframe.index.max(), + ) # Testing if the missing timestamp has been filled assert len(result) == len(timeseries_dataframe) diff --git a/tests/utils/test__select_column.py b/tests/utils/test__select_column.py deleted file mode 100644 index fb647535a..000000000 --- a/tests/utils/test__select_column.py +++ /dev/null @@ -1,490 +0,0 @@ -import datetime -import re - -import numpy as np -import pandas as pd -import pytest - -from pandas.testing import assert_index_equal, assert_frame_equal -from janitor.functions.utils import _select_column_names, patterns - - -@pytest.fixture -def df_dates(): - """pytest fixture""" - start = datetime.datetime(2011, 1, 1) - end = datetime.datetime(2012, 1, 1) - rng = pd.date_range(start, end, freq="BM") - return pd.DataFrame([np.random.randn(len(rng))], columns=rng) - - -@pytest.fixture -def df_numbers(): - """pytest fixture""" - return pd.DataFrame([np.random.randn(20)], columns=range(20)) - - -@pytest.fixture -def df(): - """pytest fixture.""" - return pd.DataFrame( - { - "id": [1, 2, 3], - "M_start_date_1": [201709, 201709, 201709], - "M_end_date_1": [201905, 201905, 201905], - "M_start_date_2": [202004, 202004, 202004], - "M_end_date_2": [202005, 202005, 202005], - "F_start_date_1": [201803, 201803, 201803], - "F_end_date_1": [201904, 201904, 201904], - "F_start_date_2": [201912, 201912, 201912], - "F_end_date_2": [202007, 202007, 202007], - } - ) - - -@pytest.fixture -def df1(): - """pytest fixture.""" - return pd.DataFrame( - { - "id": [0, 1], - "Name": ["ABC", "XYZ"], - "code": [1, 2], - "code1": [4, np.nan], - "code2": ["8", 5], - "type": ["S", "R"], - "type1": ["E", np.nan], - "type2": ["T", "U"], - "code3": pd.Series(["a", "b"], dtype="category"), - "type3": pd.to_datetime( - [np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] - ), - } - ) - - -@pytest.fixture -def df_tuple(): - "pytest fixture." - frame = pd.DataFrame( - { - "A": {0: "a", 1: "b", 2: "c"}, - "B": {0: 1, 1: 3, 2: 5}, - "C": {0: 2, 1: 4, 2: 6}, - } - ) - frame.columns = [list("ABC"), list("DEF")] - return frame - - -def test_col_not_found(df): - """Raise KeyError if `columns_to_select` is not in df.columns.""" - with pytest.raises(KeyError, match="No match was returned.+"): - _select_column_names(2.5, df) - - -def test_col_not_found1(df): - """Raise KeyError if `columns_to_select` is not in df.columns.""" - with pytest.raises(KeyError, match="No match was returned.+"): - _select_column_names(1, df) - - -def test_col_not_found2(df): - """Raise KeyError if `columns_to_select` is not in df.columns.""" - with pytest.raises(KeyError, match="No match was returned.+"): - _select_column_names([3, "id"], df) - - -def test_col_not_found3(df_dates): - """Raise KeyError if `columns_to_select` is not in df.columns.""" - with pytest.raises(KeyError): - _select_column_names("id", df_dates) - - -def test_col_not_found4(df_numbers): - """Raise KeyError if `columns_to_select` is not in df.columns.""" - with pytest.raises(KeyError, match=r"No match was returned.+"): - _select_column_names("id", df_numbers) - - -def test_tuple(df_tuple): - """Test _select_column_names function on tuple.""" - assert _select_column_names(("A", "D"), df_tuple) == [("A", "D")] - - -def test_strings(df1): - """Test _select_column_names function on strings.""" - assert _select_column_names("id", df1) == ["id"] - assert _select_column_names("*type*", df1).tolist() == [ - "type", - "type1", - "type2", - "type3", - ] - - -def test_strings_cat(df1): - """Test output on categorical columns""" - df1.columns = df1.columns.astype("category") - assert _select_column_names("id", df1) == ["id"] - assert _select_column_names("*type*", df1).tolist() == [ - "type", - "type1", - "type2", - "type3", - ] - - -def test_strings_do_not_exist(df): - """ - Raise KeyError if `columns_to_select` is a string - and does not exist in the dataframe's columns. - """ - with pytest.raises(KeyError, match="No match was returned for.+"): - _select_column_names("word", df) - - -def test_strings_dates(df_dates): - """ - Test output for datetime column. - """ - assert ( - _select_column_names("2011-01-31", df_dates)[0] - == df_dates.loc[:, "2011-01-31"].name - ) - - -def test_strings_dates_range(df_dates): - """Test output for datetime column.""" - assert_index_equal( - _select_column_names("2011-01", df_dates), - df_dates.loc[:, slice("2011-01")].columns, - ) - - -def test_unsorted_dates(df_dates): - """Test output if the dates are unsorted, and a string is passed.""" - df_dates = df_dates.iloc[:, [10, 4, 7, 2, 1, 3, 5, 6, 8, 9, 11, 0]] - expected = df_dates.loc[:, ["2011-01-31"]] - actual = _select_column_names("2011-01-31", df_dates) - actual = df_dates.loc[:, actual] - assert_frame_equal(expected, actual) - - -def test_regex(df1): - """Test _select_column_names function on regular expressions.""" - assert_index_equal( - _select_column_names(re.compile(r"\d$"), df1), - df1.filter(regex=r"\d$").columns, - ) - - -def test_regex_cat(df1): - """Test output on categorical columns""" - df1.columns = df1.columns.astype("category") - assert_index_equal( - _select_column_names(re.compile(r"\d$"), df1), - df1.filter(regex=r"\d$").columns, - ) - - -def test_patterns_warning(df1): - """ - Check that warning is raised if `janitor.patterns` is used. - """ - with pytest.warns(DeprecationWarning): - assert_index_equal( - _select_column_names(patterns(r"\d$"), df1), - df1.filter(regex=r"\d$").columns, - ) - - -def test_regex_presence_string_column(df): - """ - Raise KeyError if `columns_to_select` is a regex - and does not exist in the dataframe's columns. - """ - with pytest.raises(KeyError, match="No match was returned for.+"): - _select_column_names(re.compile("word"), df) - - -def test_regex_presence(df_dates): - """ - Raise KeyError if `columns_to_select` is a regex - and the columns is not a string column. - """ - with pytest.raises(KeyError, match=r"No match was returned.+"): - _select_column_names(re.compile(r"^\d+"), df_dates) - - -def test_slice_unique(): - """ - Raise ValueError if the columns are not unique. - """ - not_unique = pd.DataFrame([], columns=["code", "code2", "code1", "code"]) - with pytest.raises( - ValueError, - match="Non-unique column labels should be monotonic increasing.", - ): - _select_column_names(slice("code", "code2"), not_unique) - - -def test_slice_presence(df): - """ - Raise ValueError if `columns_to_select` is a slice instance - and either the start value or the end value is not present - in the dataframe. - """ - with pytest.raises(ValueError): - _select_column_names(slice("Id", "M_start_date_1"), df) - with pytest.raises(ValueError): - _select_column_names(slice("id", "M_end_date"), df) - - -def test_slice_dtypes(df): - """ - Raise ValueError if `columns_to_select` is a slice instance - and either the start value or the stop value is not a string, - or the step value is not an integer. - """ - with pytest.raises( - ValueError, - match="The start value for the slice must either be `None`.+", - ): - _select_column_names(slice(1, "M_end_date_2"), df) - with pytest.raises( - ValueError, - match="The stop value for the slice must either be `None`.+", - ): - _select_column_names(slice("id", 2), df) - with pytest.raises(ValueError, match="The step value for the slice.+"): - _select_column_names(slice("id", "M_end_date_2", "3"), df) - - -def test_unsorted_dates_slice(df_dates): - """Raise Error if the dates are unsorted.""" - df_dates = df_dates.iloc[:, ::-1] - with pytest.raises( - ValueError, - match="The column is a DatetimeIndex and should be " - "monotonic increasing.", - ): - _select_column_names(slice("2011-01-31", "2011-03-31"), df_dates) - - -def test_slice(df1): - """Test _select_column_names function on slices.""" - assert_index_equal( - _select_column_names(slice("code", "code2"), df1), - df1.loc[:, slice("code", "code2")].columns, - ) - - assert_index_equal( - _select_column_names(slice("code2", None), df1), - df1.loc[:, slice("code2", None)].columns, - ) - - assert_index_equal( - _select_column_names(slice(None, "code2"), df1), - df1.loc[:, slice(None, "code2")].columns, - ) - - assert_index_equal( - _select_column_names(slice(None, None), df1), df1.columns - ) - assert_index_equal( - _select_column_names(slice(None, None, 2), df1), - df1.loc[:, slice(None, None, 2)].columns, - ) - assert_index_equal( - _select_column_names(slice("code2", "code"), df1), - pd.Index( - [ - "code2", - "code1", - "code", - ] - ), - ) - - -def test_slice_dates(df_dates): - """Test output of slice on date column.""" - assert_index_equal( - _select_column_names(slice("2011-01-31", "2011-03-31"), df_dates), - df_dates.loc[:, "2011-01-31":"2011-03-31"].columns, - ) - - -def test_slice_dates_inexact(df_dates): - """Test output of slice on date column.""" - assert_index_equal( - _select_column_names(slice("2011-01", "2011-03"), df_dates), - df_dates.loc[:, "2011-01":"2011-03"].columns, - ) - - -def test_boolean_list_dtypes(df): - """ - Raise ValueError if `columns_to_select` is a list of booleans - and the length is unequal to the number of columns - in the dataframe. - """ - with pytest.raises( - ValueError, match="The length of the list of booleans.+" - ): - _select_column_names([True, False], df) - with pytest.raises( - ValueError, match="The length of the list of booleans.+" - ): - _select_column_names( - [True, True, True, False, False, False, True, True, True, False], - df, - ) - - -def test_callable(df_numbers): - """ - Check that error is raised if `columns_to_select` is a - callable, and at lease one Series has a wrong data type - that makes the callable unapplicable. - """ - with pytest.raises( - TypeError, - match="The output of the applied callable " - "should be a boolean array.", - ): - _select_column_names(lambda df: df + 3, df_numbers) - - -@pytest.mark.xfail( - reason="Indexing in Pandas is possible with a boolean Series." -) -def test_callable_returns_series(df): - """ - Check that error is raised if `columns_to_select` is a - callable, and returns a Series. - """ - with pytest.raises(ValueError): - _select_column_names(lambda x: x + 1, df) - - -def test_callable_no_match(df): - """ - Raise KeyError if `columns_to_select` is a callable, - and no match is returned. - """ - with pytest.raises(KeyError, match="No match was returned.+"): - _select_column_names(pd.api.types.is_float_dtype, df) - - -def test_tuple_presence(df_tuple): - """ - Raise KeyError if `columns_to_select` is a tuple - and no match is returned. - """ - with pytest.raises(KeyError, match="No match was returned.+"): - _select_column_names(("A", "C"), df_tuple) - - -def test_callable_data_type(df1): - """ - Test _select_column_names function on callables, - specifically for data type checks. - """ - assert_index_equal( - _select_column_names(pd.api.types.is_integer_dtype, df1), - df1.select_dtypes(int).columns, - ) - - assert_index_equal( - _select_column_names(pd.api.types.is_float_dtype, df1), - df1.select_dtypes(float).columns, - ) - - assert_index_equal( - _select_column_names(pd.api.types.is_numeric_dtype, df1), - df1.select_dtypes("number").columns, - ) - - assert_index_equal( - _select_column_names(pd.api.types.is_categorical_dtype, df1), - df1.select_dtypes("category").columns, - ) - - assert_index_equal( - _select_column_names(pd.api.types.is_datetime64_dtype, df1), - df1.select_dtypes(np.datetime64).columns, - ) - - assert_index_equal( - _select_column_names(pd.api.types.is_object_dtype, df1), - df1.select_dtypes("object").columns, - ) - - -def test_callable_string_methods(df1): - """ - Test _select_column_names function on callables, - specifically for column name checks. - """ - assert_index_equal( - _select_column_names(lambda x: x.name.startswith("type"), df1), - df1.filter(like="type").columns, - ) - - assert_index_equal( - _select_column_names(lambda x: x.name.endswith(("1", "2", "3")), df1), - df1.filter(regex=r"\d$").columns, - ) - - assert_index_equal( - _select_column_names(lambda x: "d" in x.name, df1), - df1.filter(regex="d").columns, - ) - - assert_index_equal( - _select_column_names( - lambda x: x.name.startswith("code") and x.name.endswith("1"), df1 - ), - df1.filter(regex=r"code.*1$").columns, - ) - - assert_index_equal( - _select_column_names( - lambda x: x.name.startswith("code") or x.name.endswith("1"), df1 - ), - df1.filter(regex=r"^code.*|.*1$").columns, - ) - - -def test_callable_computations(df1): - """ - Test _select_column_names function on callables, - specifically for computations. - """ - assert_index_equal( - _select_column_names(lambda x: x.isna().any(), df1), - df1.columns[df1.isna().any().array], - ) - - -def test_list_various(df1): - """Test _select_column_names function on list type.""" - - assert _select_column_names(["id", "Name"], df1) == ["id", "Name"] - assert _select_column_names(["id", "code*"], df1) == list( - df1.filter(regex="^id|^code").columns - ) - assert [ - *_select_column_names(["id", "code*", slice("code", "code2")], df1) - ] == df1.filter(regex="^(id|code)").columns.tolist() - assert _select_column_names(["id", "Name"], df1) == ["id", "Name"] - - -def test_list_boolean(df): - """Test _select_column_names function on list of booleans.""" - booleans = [True, True, True, False, False, False, True, True, True] - assert_index_equal( - _select_column_names(booleans, df), df.columns[booleans] - )