From 8b8c8655edd2d0ff3eefe483728c6363f7e4198e Mon Sep 17 00:00:00 2001 From: chrispj Date: Thu, 4 Mar 2021 23:03:45 +0100 Subject: [PATCH 01/10] Add IsTypeValidation --- pandas_schema/validation.py | 30 ++++++++++++++++++++++++++++++ pandas_schema/version.py | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 5f7c763..23ae62b 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -10,6 +10,7 @@ from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype +from typing import List class _BaseValidation: @@ -214,6 +215,35 @@ def validate(self, series: pd.Series) -> pd.Series: return (series >= self.min) & (series < self.max) +class IsTypeValidation(_SeriesValidation): + """ + Description: Checks that each element in the series equals one of the predefined types. + Usage: For example with types str and int: + IsTypeValidation(allowed_types=[str, int]) + """ + + def __init__(self, allowed_types: List, **kwargs): + """ + :param allowed_types: List containing the allowed data types. + """ + self.allowed_types: List = allowed_types + super().__init__(**kwargs) + + @property + def default_message(self): + return f"was not of listed type {self.allowed_types.__str__()}" + + def validate(self, series: pd.Series) -> pd.Series: + # Loop and validate per item (i.e. 
row) + return_data = [] + for index, value in series.iteritems(): + bool_value: bool = type(value) in self.allowed_types + return_data.append(bool_value) + + # Return as series + return pd.Series(data=return_data, index=series.index) + + class IsDtypeValidation(_BaseValidation): """ Checks that a series has a certain numpy dtype diff --git a/pandas_schema/version.py b/pandas_schema/version.py index 40ed83d..4596d03 100644 --- a/pandas_schema/version.py +++ b/pandas_schema/version.py @@ -1 +1 @@ -__version__ = '0.3.5' +__version__ = '0.3.6' From b56483a00d0467939c872d1aeceba555bc175dc7 Mon Sep 17 00:00:00 2001 From: chrispijo <34818065+chrispijo@users.noreply.github.com> Date: Fri, 5 Mar 2021 11:37:50 +0100 Subject: [PATCH 02/10] Update validation.py Removed typing at line 229 because this caused an issue in Python 3.5. --- pandas_schema/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 23ae62b..cfcc7a7 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -226,7 +226,7 @@ def __init__(self, allowed_types: List, **kwargs): """ :param allowed_types: List containing the allowed data types. """ - self.allowed_types: List = allowed_types + self.allowed_types = allowed_types super().__init__(**kwargs) @property From e382ce5f08dd79dfc41b09886f7b9a17da0b0bf5 Mon Sep 17 00:00:00 2001 From: chrispijo <34818065+chrispijo@users.noreply.github.com> Date: Fri, 5 Mar 2021 11:42:03 +0100 Subject: [PATCH 03/10] Update validation.py Changed line 234 because format `f"text {variable}"` is not allowed in Python 3.5. 
--- pandas_schema/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index cfcc7a7..cbb4671 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -231,7 +231,7 @@ def __init__(self, allowed_types: List, **kwargs): @property def default_message(self): - return f"was not of listed type {self.allowed_types.__str__()}" + return "was not of listed type {}".format(self.allowed_types.__str__()) def validate(self, series: pd.Series) -> pd.Series: # Loop and validate per item (i.e. row) From b1835a3ae8955b61893dc3a6621758f6c52fb057 Mon Sep 17 00:00:00 2001 From: chrispijo <34818065+chrispijo@users.noreply.github.com> Date: Fri, 5 Mar 2021 11:45:34 +0100 Subject: [PATCH 04/10] Update validation.py Typing in line 240 was not allowed. --- pandas_schema/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index cbb4671..6c5a9d4 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -237,7 +237,7 @@ def validate(self, series: pd.Series) -> pd.Series: # Loop and validate per item (i.e. 
row) return_data = [] for index, value in series.iteritems(): - bool_value: bool = type(value) in self.allowed_types + bool_value = type(value) in self.allowed_types return_data.append(bool_value) # Return as series From ca8e1e5afe35e52990ae2d095350d0dc844bb409 Mon Sep 17 00:00:00 2001 From: chrispj Date: Sat, 6 Mar 2021 17:38:47 +0100 Subject: [PATCH 05/10] Corrections after feedback --- pandas_schema/validation.py | 69 ++++++++++++++++++++++++++++++------- pandas_schema/version.py | 2 +- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 6c5a9d4..55bbcd2 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -215,11 +215,38 @@ def validate(self, series: pd.Series) -> pd.Series: return (series >= self.min) & (series < self.max) +def convert_type_to_dtype(type_to_convert: type) -> np.dtype: + """ + Converts type to the numpy variant dtype. + :param type_to_convert: The type to convert to np.dtype. + :return: Numpy dtype + """ + # DISLIKE 02: It is doubtful if this function converts all types correctly to numpy in accordance to a Pandas + # Series. + if type_to_convert == int: + return np.dtype(np.int64) # np.dtype(int) results in np.int32. + elif type_to_convert == str: + return np.dtype(object) + else: + return np.dtype(type_to_convert) + + class IsTypeValidation(_SeriesValidation): """ - Description: Checks that each element in the series equals one of the predefined types. - Usage: For example with types str and int: - IsTypeValidation(allowed_types=[str, int]) + Checks that each element in the series equals one of the allowed types. This validation only makes sense for an + object series. 
+ + Examples + -------- + >>> v = IsTypeValidation(allowed_types=[str, int]) + >>> s = pd.Series(data=["alpha", 1.4, True, "beta", 5]) + >>> v.validate(series=s) + 0 True + 1 False + 2 False + 3 True + 4 True + dtype: bool """ def __init__(self, allowed_types: List, **kwargs): @@ -231,17 +258,35 @@ def __init__(self, allowed_types: List, **kwargs): @property def default_message(self): - return "was not of listed type {}".format(self.allowed_types.__str__()) + return f"was not of listed type {self.allowed_types.__str__()}" + + def get_errors(self, series: pd.Series, column: 'column.Column' = None): + + # Numpy dtypes other than 'object' can be validated with IsDtypeValidation instead, but only if the + # allowed_types is singular. Otherwise continue. + # DISLIKE 01: IsDtypeValidation only allows a single dtype. So this if-statement redirects only if one type is + # specified in the list self.allowed_types. + if not series.dtype == np.dtype(object) and len(self.allowed_types) == 1: + allowed_type = convert_type_to_dtype(type_to_convert=self.allowed_types[0]) + new_validation_method = IsDtypeValidation(dtype=np.dtype(allowed_type)) + return new_validation_method.get_errors(series=series) + + # Else, validate each element along the allowed types. + errors = [] + valid_indices = series.index[~self.validate(series)] + for i in valid_indices: + element = series[i] + errors.append(ValidationWarning( + message=self.message, + value=element, + row=i, + column=series.name + )) + + return errors def validate(self, series: pd.Series) -> pd.Series: - # Loop and validate per item (i.e. 
row) - return_data = [] - for index, value in series.iteritems(): - bool_value = type(value) in self.allowed_types - return_data.append(bool_value) - - # Return as series - return pd.Series(data=return_data, index=series.index) + return series.apply(type).isin(self.allowed_types) class IsDtypeValidation(_BaseValidation): diff --git a/pandas_schema/version.py b/pandas_schema/version.py index 4596d03..bbae3fb 100644 --- a/pandas_schema/version.py +++ b/pandas_schema/version.py @@ -1 +1 @@ -__version__ = '0.3.6' +__version__ = '0.3.5.7' From 2253125b8c5b008e4f6352a14e97d2643c8c8c8c Mon Sep 17 00:00:00 2001 From: chrispj Date: Sat, 6 Mar 2021 17:43:13 +0100 Subject: [PATCH 06/10] Corrections Python 3.5 --- pandas_schema/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 55bbcd2..9557d6f 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -258,7 +258,7 @@ def __init__(self, allowed_types: List, **kwargs): @property def default_message(self): - return f"was not of listed type {self.allowed_types.__str__()}" + return "was not of listed type {}".format(self.allowed_types.__str__()) def get_errors(self, series: pd.Series, column: 'column.Column' = None): From 3598ce92ef243948bcb10ea0a6d0f6a40e4f9e4c Mon Sep 17 00:00:00 2001 From: chrispj Date: Wed, 10 Mar 2021 20:42:50 +0100 Subject: [PATCH 07/10] Feedback IsTypeValidation added. 
IsDtypeValidation changed --- pandas_schema/validation.py | 91 +++++++++++++++++++------------------ pandas_schema/version.py | 2 +- 2 files changed, 47 insertions(+), 46 deletions(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 9557d6f..aba6f00 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -10,7 +10,7 @@ from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -from typing import List +from typing import List, Union class _BaseValidation: @@ -215,22 +215,6 @@ def validate(self, series: pd.Series) -> pd.Series: return (series >= self.min) & (series < self.max) -def convert_type_to_dtype(type_to_convert: type) -> np.dtype: - """ - Converts type to the numpy variant dtype. - :param type_to_convert: The type to convert to np.dtype. - :return: Numpy dtype - """ - # DISLIKE 02: It is doubtful if this function converts all types correctly to numpy in accordance to a Pandas - # Series. - if type_to_convert == int: - return np.dtype(np.int64) # np.dtype(int) results in np.int32. - elif type_to_convert == str: - return np.dtype(object) - else: - return np.dtype(type_to_convert) - - class IsTypeValidation(_SeriesValidation): """ Checks that each element in the series equals one of the allowed types. This validation only makes sense for an @@ -258,32 +242,17 @@ def __init__(self, allowed_types: List, **kwargs): @property def default_message(self): - return "was not of listed type {}".format(self.allowed_types.__str__()) + return "is not of type listed in {}".format(self.allowed_types.__str__()) - def get_errors(self, series: pd.Series, column: 'column.Column' = None): + def get_errors(self, series: pd.Series, column: 'column.Column'): - # Numpy dtypes other than 'object' can be validated with IsDtypeValidation instead, but only if the - # allowed_types is singular. Otherwise continue. 
- # DISLIKE 01: IsDtypeValidation only allows a single dtype. So this if-statement redirects only if one type is - # specified in the list self.allowed_types. - if not series.dtype == np.dtype(object) and len(self.allowed_types) == 1: - allowed_type = convert_type_to_dtype(type_to_convert=self.allowed_types[0]) - new_validation_method = IsDtypeValidation(dtype=np.dtype(allowed_type)) - return new_validation_method.get_errors(series=series) + # Numpy dtypes other than 'object' can be validated with IsDtypeValidation instead. + if not series.dtype == np.dtype(object): + np_allowed_types = [np.dtype(allowed_type) for allowed_type in self.allowed_types] + alternative_validation_method = IsDtypeValidation(dtype=np_allowed_types) + return alternative_validation_method.get_errors(series=series, column=column) - # Else, validate each element along the allowed types. - errors = [] - valid_indices = series.index[~self.validate(series)] - for i in valid_indices: - element = series[i] - errors.append(ValidationWarning( - message=self.message, - value=element, - row=i, - column=series.name - )) - - return errors + return super().get_errors(series=series, column=column) def validate(self, series: pd.Series) -> pd.Series: return series.apply(type).isin(self.allowed_types) @@ -291,26 +260,58 @@ def validate(self, series: pd.Series) -> pd.Series: class IsDtypeValidation(_BaseValidation): """ - Checks that a series has a certain numpy dtype + Checks that a series has (one of) the required numpy dtype(s). 
+ + Examples + -------- + >>> v = IsDtypeValidation(dtype=[np.str0, np.float64]) + >>> s = pd.Series(data=np.array([1, 2, 3, 4, 5]), name='IntCol') + >>> err = v.get_errors(series=s, column=Column(name=s.name.__str__())) + >>> err[0].__str__() + "The column IntCol has a dtype of int32 which is not a subclass of the required type [<class 'numpy.str_'>, + <class 'numpy.float64'>]" """ - def __init__(self, dtype: np.dtype, **kwargs): + def __init__(self, dtype: Union[np.dtype, List[np.dtype]], **kwargs): """ - :param dtype: The numpy dtype to check the column against + :param dtype: The numpy dtype to check the column against. Input can be either a single dtype or a list of + dtypes. """ self.dtype = dtype + if type(dtype) is not list: + self.dtype = [dtype] super().__init__(**kwargs) + @staticmethod + def convert_series_dtype_to_system_default(series: pd.Series) -> pd.Series: + """ On Windows np.dtype(int) returns np.int32, whereas Pandas.Series([1, 2, 3, ..., n]).dtype returns np.int64. + Linux does return np.int64 for np.dtype(int). Other types (float, bool, etc) return equal types. + For this reason, the series is converted back and forth to ensure equal types between pandas and numpy.""" + python_type = type(np.zeros(1, series.dtype).tolist()[0]) # First convert to Python type. + return series.astype(python_type) # Then convert back based on system preference. + def get_errors(self, series: pd.Series, column: 'column.Column' = None): - if not np.issubdtype(series.dtype, self.dtype): + + # Convert to system dependent default numpy dtype. 
+ series_converted_type = self.convert_series_dtype_to_system_default(series=series) + + # Validate and return (possible) error messages + if not self.validate(series=series_converted_type): return [ValidationWarning( 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( - column.name if column else '', series.dtype, self.dtype + column.name if column else '', series_converted_type.dtype, self.dtype ) )] else: return [] + def validate(self, series: pd.Series) -> bool: + + # Convert to system dependent default numpy dtype. + series_converted_type = self.convert_series_dtype_to_system_default(series=series) + + return True in [np.issubdtype(series_converted_type.dtype, given_dtype) for given_dtype in self.dtype] + class CanCallValidation(_SeriesValidation): """ diff --git a/pandas_schema/version.py b/pandas_schema/version.py index bbae3fb..40ed83d 100644 --- a/pandas_schema/version.py +++ b/pandas_schema/version.py @@ -1 +1 @@ -__version__ = '0.3.5.7' +__version__ = '0.3.5' From 59713cbee6a780008f7752012c820d3e62f73fe2 Mon Sep 17 00:00:00 2001 From: chrispj Date: Wed, 10 Mar 2021 20:56:49 +0100 Subject: [PATCH 08/10] Update test-file --- pandas_schema/validation.py | 2 +- test/test_validation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index aba6f00..222c4f4 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -290,7 +290,7 @@ def convert_series_dtype_to_system_default(series: pd.Series) -> pd.Series: python_type = type(np.zeros(1, series.dtype).tolist()[0]) # First convert to Python type. return series.astype(python_type) # Then convert back based on system preference. - def get_errors(self, series: pd.Series, column: 'column.Column' = None): + def get_errors(self, series: pd.Series, column: 'column.Column' = None) -> list: # Convert to system dependent default numpy dtype. 
series_converted_type = self.convert_series_dtype_to_system_default(series=series) diff --git a/test/test_validation.py b/test/test_validation.py index fc40100..4afefd0 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -491,7 +491,7 @@ class Dtype(ValidationTestBase): """ def setUp(self): - self.validator = IsDtypeValidation(np.number) + self.validator = IsDtypeValidation(np.dtype(np.number)) def test_valid_items(self): errors = self.validator.get_errors(pd.Series( From f8e593e48591bda52616e0c443ea799aab1cc2b6 Mon Sep 17 00:00:00 2001 From: chrispj Date: Wed, 10 Mar 2021 21:07:42 +0100 Subject: [PATCH 09/10] Removed test-changes --- pandas_schema/validation.py | 6 ++++++ test/test_validation.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 222c4f4..3bb5fb2 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -287,6 +287,12 @@ def convert_series_dtype_to_system_default(series: pd.Series) -> pd.Series: """ On Windows np.dtype(int) returns np.int32, whereas Pandas.Series([1, 2, 3, ..., n]).dtype returns np.int64. Linux does return np.int64 for np.dtype(int). Other types (float, bool, etc) return equal types. For this reason, the series is converted back and forth to ensure equal types between pandas and numpy.""" + + # If not numeric, no conversion necessary + if not np.issubdtype(series.dtype, np.number): + return series + + # Convert python_type = type(np.zeros(1, series.dtype).tolist()[0]) # First convert to Python type. return series.astype(python_type) # Then convert back based on system preference. 
diff --git a/test/test_validation.py b/test/test_validation.py index 4afefd0..fc40100 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -491,7 +491,7 @@ class Dtype(ValidationTestBase): """ def setUp(self): - self.validator = IsDtypeValidation(np.dtype(np.number)) + self.validator = IsDtypeValidation(np.number) def test_valid_items(self): errors = self.validator.get_errors(pd.Series( From d05f3652517e1d6a0c46d74b25e231badbf76ec5 Mon Sep 17 00:00:00 2001 From: chrispj Date: Mon, 3 May 2021 20:07:58 +0200 Subject: [PATCH 10/10] Added validation of input argument --- pandas_schema/validation.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 3bb5fb2..50b3c90 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -217,8 +217,7 @@ def validate(self, series: pd.Series) -> pd.Series: class IsTypeValidation(_SeriesValidation): """ - Checks that each element in the series equals one of the allowed types. This validation only makes sense for an - object series. + Checks that each element in the series equals one of the provided allowed types. Examples -------- @@ -235,10 +234,25 @@ class IsTypeValidation(_SeriesValidation): def __init__(self, allowed_types: List, **kwargs): """ - :param allowed_types: List containing the allowed data types. + :param allowed_types: List describing which types are allowed. The list may only contain the build-in + Python-types "str", "int", "float" and/or "bool". """ + self._allowed_build_in_types = [str, int, float, bool] self.allowed_types = allowed_types super().__init__(**kwargs) + self._validate_input() + + def _validate_input(self): + if type(self.allowed_types) != list: + raise PanSchArgumentError('The argument "allowed_types" passed to IsTypeValidation is not of type list. 
' + 'Provide a list containing one or more of the Python built-in types "str", ' + '"int", "float" or "bool".') + + for allowed_type in self.allowed_types: + if allowed_type not in self._allowed_build_in_types: + raise PanSchArgumentError('The item "{}" provided in the argument "allowed_types" as passed to ' + 'IsTypeValidation is not of the correct type. Provide one of Python built-in ' + 'types "str", "int", "float" or "bool".'.format(allowed_type)) @property def default_message(self):