From 2f7e7f3d3295bb0c9e3069b6afe2dc2a37b167a1 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 18 Nov 2019 01:24:46 +1100 Subject: [PATCH 01/13] Initial groundwork for the rewrite --- pandas_schema/column.py | 63 ++++-- pandas_schema/errors.py | 6 + pandas_schema/schema.py | 2 +- pandas_schema/validation.py | 411 +++--------------------------------- 4 files changed, 85 insertions(+), 397 deletions(-) diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 199b883..cec4153 100644 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -4,24 +4,51 @@ from . import validation from .validation_warning import ValidationWarning -class Column: - def __init__(self, name: str, validations: typing.Iterable['validation._BaseValidation'] = [], allow_empty=False): - """ - Creates a new Column object - :param name: The column header that defines this column. This must be identical to the header used in the CSV/Data Frame you are validating. - :param validations: An iterable of objects implementing _BaseValidation that will generate ValidationErrors - :param allow_empty: True if an empty column is considered valid. False if we leave that logic up to the Validation - """ - self.name = name - self.validations = list(validations) - self.allow_empty = allow_empty +def _column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: typing.Union[int, str] = None, + position: bool = False +): + """ + A utility method for setting the index data on a set of Validations + :param validations: A list of validations to modify + :param index: The index of the series that these validations will now consider + :param position: If true, these validations use positional indexing. 
+ See :py:class:`pandas_schema.validation.IndexSeriesValidation` + """ + for valid in validations: + valid.index = index + valid.position = position - def validate(self, series: pd.Series) -> typing.List[ValidationWarning]: - """ - Creates a list of validation errors using the Validation objects contained in the Column - :param series: A pandas Series to validate - :return: An iterable of ValidationError instances generated by the validation - """ - return [error for validation in self.validations for error in validation.get_errors(series, self)] +def label_column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: typing.Union[int, str], +): + """ + A utility method for setting the label-based column for each validation + :param validations: A list of validations to modify + :param index: The label of the series that these validations will now consider + """ + return _column( + validations, + index, + position=False + ) + + +def positional_column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: int, +): + """ + A utility method for setting the position-based column for each validation + :param validations: A list of validations to modify + :param index: The index of the series that these validations will now consider + """ + return _column( + validations, + index, + position=True + ) diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py index a9176bf..ab5e73d 100644 --- a/pandas_schema/errors.py +++ b/pandas_schema/errors.py @@ -10,6 +10,12 @@ class PanSchInvalidSchemaError(PanSchError): """ +class PanSchNoIndexError(PanSchInvalidSchemaError): + """ + A validation was provided that has not specified an index + """ + + class PanSchArgumentError(PanSchError): """ An argument passed to a function has an invalid type or value diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 5c0442e..13d8158 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -11,7 +11,7 @@ class Schema: A 
schema that defines the columns required in the target DataFrame """ - def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): + def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): """ :param columns: A list of column objects :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 2a3f2f8..9343d7b 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -8,412 +8,67 @@ from . import column from .validation_warning import ValidationWarning -from .errors import PanSchArgumentError +from .errors import PanSchArgumentError, PanSchNoIndexError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class _BaseValidation: - """ - The validation base class that defines any object that can create a list of errors from a Series - """ - __metaclass__ = abc.ABCMeta - +class _BaseValidation(abc.ABC): @abc.abstractmethod - def get_errors(self, series: pd.Series, column: 'column.Column') -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ - Return a list of errors in the given series - :param series: - :param column: - :return: + Validates a data frame + :param df: Data frame to validate + :return: All validation failures detected by this validation """ class _SeriesValidation(_BaseValidation): """ - Implements the _BaseValidation interface by returning a Boolean series for each element that either passes or - fails the validation + A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation + to it """ - __metaclass__ = abc.ABCMeta - - def __init__(self, **kwargs): - self._custom_message = kwargs.get('message') - - @property - def message(self): - return self._custom_message or self.default_message - - @abc.abstractproperty - def default_message(self) 
-> str: - """ - Create a message to be displayed whenever this validation fails - This should be a generic message for the validation type, but can be overwritten if the user provides a - message kwarg - """ @abc.abstractmethod - def validate(self, series: pd.Series) -> pd.Series: - """ - Returns a Boolean series, where each value of False is an element in the Series that has failed the validation - :param series: - :return: - """ - - def __invert__(self): - """ - Returns a negated version of this validation - """ - return _InverseValidation(self) - - def __or__(self, other: '_SeriesValidation'): - """ - Returns a validation which is true if either this or the other validation is true - """ - return _CombinedValidation(self, other, operator.or_) - - def __and__(self, other: '_SeriesValidation'): + def select_series(self, df: pd.DataFrame) -> pd.Series: """ - Returns a validation which is true if either this or the other validation is true + Selects a series from the DataFrame that will be validated """ - return _CombinedValidation(self, other, operator.and_) - - def get_errors(self, series: pd.Series, column: 'column.Column'): - - errors = [] - - # Calculate which columns are valid using the child class's validate function, skipping empty entries if the - # column specifies to do so - simple_validation = ~self.validate(series) - if column.allow_empty: - # Failing results are those that are not empty, and fail the validation - # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is - if is_categorical_dtype(series) or is_numeric_dtype(series): - validated = ~series.isnull() & simple_validation - else: - validated = (series.str.len() > 0) & simple_validation - - else: - validated = simple_validation - - # Cut down the original series to only ones that failed the validation - indices = series.index[validated] - - # Use these indices to find the failing items. 
Also print the index which is probably a row number - for i in indices: - element = series[i] - errors.append(ValidationWarning( - message=self.message, - value=element, - row=i, - column=series.name - )) - - return errors - - -class _InverseValidation(_SeriesValidation): - """ - Negates an ElementValidation - """ - - def __init__(self, validation: _SeriesValidation): - self.negated = validation - super().__init__() - - def validate(self, series: pd.Series): - return ~ self.negated.validate(series) - - @property - def default_message(self): - return self.negated.message + ' ' - - -class _CombinedValidation(_SeriesValidation): - """ - Validates if one and/or the other validation is true for an element - """ - - def __init__(self, validation_a: _SeriesValidation, validation_b: _SeriesValidation, operator): - self.operator = operator - self.v_a = validation_a - self.v_b = validation_b - super().__init__() - - def validate(self, series: pd.Series): - return self.operator(self.v_a.validate(series), self.v_b.validate(series)) - - @property - def default_message(self): - return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) - - -class CustomSeriesValidation(_SeriesValidation): - """ - Validates using a user-provided function that operates on an entire series (for example by using one of the pandas - Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) - """ - - def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): - """ - :param message: The error message to provide to the user if this validation fails. 
The row and column and - failing value will automatically be prepended to this message, so you only have to provide a message that - describes what went wrong, for example 'failed my validation' will become - - {row: 1, column: "Column Name"}: "Value" failed my validation - :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal - to True if the object passed validation, and False if it failed - """ - self._validation = validation - super().__init__(message=message) - - def validate(self, series: pd.Series) -> pd.Series: - return self._validation(series) - - -class CustomElementValidation(_SeriesValidation): - """ - Validates using a user-provided function that operates on each element - """ - - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): - """ - :param message: The error message to provide to the user if this validation fails. The row and column and - failing value will automatically be prepended to this message, so you only have to provide a message that - describes what went wrong, for example 'failed my validation' will become - - {row: 1, column: "Column Name"}: "Value" failed my validation - :param validation: A function that takes the value of a data frame cell and returns True if it passes the - the validation, and false if it doesn't - """ - self._validation = validation - super().__init__(message=message) - - def validate(self, series: pd.Series) -> pd.Series: - return series.apply(self._validation) - - -class InRangeValidation(_SeriesValidation): - """ - Checks that each element in the series is within a given numerical range - """ - - def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): - """ - :param min: The minimum (inclusive) value to accept - :param max: The maximum (exclusive) value to accept - """ - self.min = min - self.max = max - super().__init__(**kwargs) - - @property - def default_message(self): - return 'was not in 
the range [{}, {})'.format(self.min, self.max) - - def validate(self, series: pd.Series) -> pd.Series: - series = pd.to_numeric(series) - return (series >= self.min) & (series < self.max) - -class IsDtypeValidation(_BaseValidation): - """ - Checks that a series has a certain numpy dtype - """ - - def __init__(self, dtype: np.dtype, **kwargs): - """ - :param dtype: The numpy dtype to check the column against - """ - self.dtype = dtype - super().__init__(**kwargs) - - def get_errors(self, series: pd.Series, column: 'column.Column' = None): - if not np.issubdtype(series.dtype, self.dtype): - return [ValidationWarning( - 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( - column.name if column else '', series.dtype, self.dtype - ) - )] - else: - return [] - - -class CanCallValidation(_SeriesValidation): - """ - Validates if a given function can be called on each element in a column without raising an exception - """ - - def __init__(self, func: typing.Callable, **kwargs): - """ - :param func: A python function that will be called with the value of each cell in the DataFrame. If this - function throws an error, this cell is considered to have failed the validation. Otherwise it has passed. 
- """ - if callable(type): - self.callable = func - else: - raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) - super().__init__(**kwargs) - - @property - def default_message(self): - return 'raised an exception when the callable {} was called on it'.format(self.callable) - - def can_call(self, var): - try: - self.callable(var) - return True - except: - return False - - def validate(self, series: pd.Series) -> pd.Series: - return series.apply(self.can_call) - - -class CanConvertValidation(CanCallValidation): - """ - Checks if each element in a column can be converted to a Python object type - """ - - """ - Internally this uses the same logic as CanCallValidation since all types are callable in python. - However this class overrides the error messages to make them more directed towards types - """ - - def __init__(self, _type: type, **kwargs): + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: """ - :param _type: Any python type. Its constructor will be called with the value of the individual cell as its - only argument. 
If it throws an exception, the value is considered to fail the validation, otherwise it has passed + Validate a single series """ - if isinstance(_type, type): - super(CanConvertValidation, self).__init__(_type, **kwargs) - else: - raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): - return 'cannot be converted to type {}'.format(self.callable) + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + series = self.select_series(df) + return self.validate_series(series) -class MatchesPatternValidation(_SeriesValidation): +class IndexSeriesValidation(_SeriesValidation): """ - Validates that a string or regular expression can match somewhere in each element in this column + Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation + or later """ - def __init__(self, pattern, options={}, **kwargs): + def __init__(self, index: typing.Union[int, str] = None, position: bool = False): """ - :param kwargs: Arguments to pass to Series.str.contains - (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html) - pat is the only required argument + Creates a new IndexSeriesValidation + :param index: An index with which to select the series + :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
+ Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ - self.pattern = pattern - self.options = options - super().__init__(**kwargs) - - @property - def default_message(self): - return 'does not match the pattern "{}"'.format(self.pattern) - - def validate(self, series: pd.Series) -> pd.Series: - return series.astype(str).str.contains(self.pattern, **self.options) - - -class TrailingWhitespaceValidation(_SeriesValidation): - """ - Checks that there is no trailing whitespace in this column - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains trailing whitespace' - - def validate(self, series: pd.Series) -> pd.Series: - return ~series.astype(str).str.contains('\s+$') - - -class LeadingWhitespaceValidation(_SeriesValidation): - """ - Checks that there is no leading whitespace in this column - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains leading whitespace' - - def validate(self, series: pd.Series) -> pd.Series: - return ~series.astype(str).str.contains('^\s+') - - -class IsDistinctValidation(_SeriesValidation): - """ - Checks that every element of this column is different from each other element - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains values that are not unique' + self.index = column + self.position = position - def validate(self, series: pd.Series) -> pd.Series: - return ~series.duplicated(keep='first') - - -class InListValidation(_SeriesValidation): - """ - Checks that each element in this column is contained within a list of possibilities - """ - - def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwargs): + def select_series(self, df: pd.DataFrame) -> pd.Series: """ - :param options: A list of values to check. 
If the value of a cell is in this list, it is considered to pass the - validation + Select a series using the data stored in this validation """ - self.case_sensitive = case_sensitive - self.options = options - super().__init__(**kwargs) - - @property - def default_message(self): - values = ', '.join(str(v) for v in self.options) - return 'is not in the list of legal options ({})'.format(values) + if self.index is None: + raise PanSchNoIndexError() - def validate(self, series: pd.Series) -> pd.Series: - if self.case_sensitive: - return series.isin(self.options) + if self.position: + return df.iloc[self.index] else: - return series.str.lower().isin([s.lower() for s in self.options]) - - -class DateFormatValidation(_SeriesValidation): - """ - Checks that each element in this column is a valid date according to a provided format string - """ - - def __init__(self, date_format: str, **kwargs): - """ - :param date_format: The date format string to validate the column against. Refer to the date format code - documentation at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for a full - list of format codes - """ - self.date_format = date_format - super().__init__(**kwargs) - - @property - def default_message(self): - return 'does not match the date format string "{}"'.format(self.date_format) - - def valid_date(self, val): - try: - datetime.datetime.strptime(val, self.date_format) - return True - except: - return False - - def validate(self, series: pd.Series) -> pd.Series: - return series.astype(str).apply(self.valid_date) + return df.loc[self.index] From e92045d12b4ca8c4d0e7ea26ac94ec7f4a4cc9c0 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Wed, 25 Dec 2019 01:59:06 +1100 Subject: [PATCH 02/13] InRangeValidation working with tests --- .gitignore | 0 .travis.yml | 0 LICENSE | 0 README.rst | 0 doc/common/introduction.rst | 0 doc/readme/README.rst | 0 doc/readme/conf.py | 0 doc/site/Makefile | 0 doc/site/conf.py | 0 doc/site/index.rst | 
0 example/boolean.py | 0 example/boolean.txt | 0 example/example.py | 0 example/example.txt | 0 pandas_schema/__init__.py | 2 -- pandas_schema/column.py | 11 +++---- pandas_schema/{validation.py => core.py} | 39 ++++++++++++++++++++++-- pandas_schema/errors.py | 0 pandas_schema/schema.py | 1 - pandas_schema/validation_warning.py | 0 pandas_schema/validations.py | 27 ++++++++++++++++ pandas_schema/version.py | 0 requirements.txt | 0 test/__init__.py | 0 test/test_column.py | 0 test/test_example.py | 0 test/test_metadata.py | 0 test/test_schema.py | 0 test/test_validation.py | 8 ++--- test/test_validation_warning.py | 0 30 files changed, 72 insertions(+), 16 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .travis.yml mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.rst mode change 100644 => 100755 doc/common/introduction.rst mode change 100644 => 100755 doc/readme/README.rst mode change 100644 => 100755 doc/readme/conf.py mode change 100644 => 100755 doc/site/Makefile mode change 100644 => 100755 doc/site/conf.py mode change 100644 => 100755 doc/site/index.rst mode change 100644 => 100755 example/boolean.py mode change 100644 => 100755 example/boolean.txt mode change 100644 => 100755 example/example.py mode change 100644 => 100755 example/example.txt mode change 100644 => 100755 pandas_schema/__init__.py mode change 100644 => 100755 pandas_schema/column.py rename pandas_schema/{validation.py => core.py} (65%) mode change 100644 => 100755 mode change 100644 => 100755 pandas_schema/errors.py mode change 100644 => 100755 pandas_schema/schema.py mode change 100644 => 100755 pandas_schema/validation_warning.py create mode 100755 pandas_schema/validations.py mode change 100644 => 100755 pandas_schema/version.py mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 test/__init__.py mode change 100644 => 100755 test/test_column.py mode change 100644 => 100755 test/test_example.py mode change 
100644 => 100755 test/test_metadata.py mode change 100644 => 100755 test/test_schema.py mode change 100644 => 100755 test/test_validation.py mode change 100644 => 100755 test/test_validation_warning.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.travis.yml b/.travis.yml old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/doc/common/introduction.rst b/doc/common/introduction.rst old mode 100644 new mode 100755 diff --git a/doc/readme/README.rst b/doc/readme/README.rst old mode 100644 new mode 100755 diff --git a/doc/readme/conf.py b/doc/readme/conf.py old mode 100644 new mode 100755 diff --git a/doc/site/Makefile b/doc/site/Makefile old mode 100644 new mode 100755 diff --git a/doc/site/conf.py b/doc/site/conf.py old mode 100644 new mode 100755 diff --git a/doc/site/index.rst b/doc/site/index.rst old mode 100644 new mode 100755 diff --git a/example/boolean.py b/example/boolean.py old mode 100644 new mode 100755 diff --git a/example/boolean.txt b/example/boolean.txt old mode 100644 new mode 100755 diff --git a/example/example.py b/example/example.py old mode 100644 new mode 100755 diff --git a/example/example.txt b/example/example.txt old mode 100644 new mode 100755 diff --git a/pandas_schema/__init__.py b/pandas_schema/__init__.py old mode 100644 new mode 100755 index 6f7ff97..fabe184 --- a/pandas_schema/__init__.py +++ b/pandas_schema/__init__.py @@ -1,4 +1,2 @@ -from .column import Column from .validation_warning import ValidationWarning -from .schema import Schema from .version import __version__ diff --git a/pandas_schema/column.py b/pandas_schema/column.py old mode 100644 new mode 100755 index cec4153..e0df39a --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,12 +1,11 @@ import typing import pandas as pd -from . 
import validation +import pandas_schema.core from .validation_warning import ValidationWarning - def _column( - validations: typing.Iterable[validation.IndexSeriesValidation], + validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: typing.Union[int, str] = None, position: bool = False ): @@ -21,9 +20,8 @@ def _column( valid.index = index valid.position = position - def label_column( - validations: typing.Iterable[validation.IndexSeriesValidation], + validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: typing.Union[int, str], ): """ @@ -37,9 +35,8 @@ def label_column( position=False ) - def positional_column( - validations: typing.Iterable[validation.IndexSeriesValidation], + validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: int, ): """ diff --git a/pandas_schema/validation.py b/pandas_schema/core.py old mode 100644 new mode 100755 similarity index 65% rename from pandas_schema/validation.py rename to pandas_schema/core.py index 9343d7b..9b3c2fc --- a/pandas_schema/validation.py +++ b/pandas_schema/core.py @@ -5,6 +5,7 @@ import numpy as np import typing import operator +import re from . import column from .validation_warning import ValidationWarning @@ -13,6 +14,9 @@ class _BaseValidation(abc.ABC): + """ + A validation is, broadly, just a function that maps a data frame to a list of errors + """ @abc.abstractmethod def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ @@ -51,15 +55,42 @@ class IndexSeriesValidation(_SeriesValidation): or later """ - def __init__(self, index: typing.Union[int, str] = None, position: bool = False): + def __init__(self, index: typing.Union[int, str] = None, position: bool = False, message:str=None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ - self.index = column + self.index = index self.position = position + self.custom_message = message + + @property + def message(self): + """ + Gets a message describing how the DataFrame cell failed the validation + This shouldn't really be overridden, instead override default_message so that users can still set per-object + messages + :return: + """ + return self.custom_message or self.default_message + + @property + def readable_name(self): + """ + A readable name for this validation, to be shown in validation warnings + """ + return type(self).__name__ + + @property + def default_message(self) -> str: + """ + Create a message to be displayed whenever this validation fails + This should be a generic message for the validation type, but can be overwritten if the user provides a + message kwarg + """ + return 'failed the {}'.format(self.readable_name) def select_series(self, df: pd.DataFrame) -> pd.Series: """ @@ -72,3 +103,7 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return df.iloc[self.index] else: return df.loc[self.index] + + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + pass diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py old mode 100644 new mode 100755 diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py old mode 100644 new mode 100755 index 13d8158..da27d81 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -3,7 +3,6 @@ from .errors import PanSchInvalidSchemaError, PanSchArgumentError from .validation_warning import ValidationWarning -from .column import Column class Schema: diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py old mode 100644 new mode 100755 diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py new file mode 100755 index 0000000..14c3df8 --- /dev/null +++ b/pandas_schema/validations.py @@ -0,0 
+1,27 @@ +from .core import _SeriesValidation, IndexSeriesValidation +from .validation_warning import ValidationWarning +import pandas as pd +import math +import typing + +class InRangeValidation(IndexSeriesValidation): + """ + Checks that each element in the series is within a given numerical range + """ + + def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): + """ + :param min: The minimum (inclusive) value to accept + :param max: The maximum (exclusive) value to accept + """ + self.min = min + self.max = max + super().__init__(**kwargs) + + @property + def default_message(self): + return 'was not in the range [{}, {})'.format(self.min, self.max) + + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + series = pd.to_numeric(series) + return (series >= self.min) & (series < self.max) diff --git a/pandas_schema/version.py b/pandas_schema/version.py old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/test/__init__.py b/test/__init__.py old mode 100644 new mode 100755 diff --git a/test/test_column.py b/test/test_column.py old mode 100644 new mode 100755 diff --git a/test/test_example.py b/test/test_example.py old mode 100644 new mode 100755 diff --git a/test/test_metadata.py b/test/test_metadata.py old mode 100644 new mode 100755 diff --git a/test/test_schema.py b/test/test_schema.py old mode 100644 new mode 100755 diff --git a/test/test_validation.py b/test/test_validation.py old mode 100644 new mode 100755 index 7914025..d8928dc --- a/test/test_validation.py +++ b/test/test_validation.py @@ -3,10 +3,10 @@ import re from numpy import nan, dtype +import pandas as pd -from pandas_schema import Column, Schema -from pandas_schema.validation import _BaseValidation -from pandas_schema.validation import * +from pandas_schema.validations import InRangeValidation +from pandas_schema.core import _BaseValidation from pandas_schema import 
ValidationWarning @@ -32,7 +32,7 @@ def validate_and_compare(self, series: list, expected_result: bool, msg: str = N self.addTypeEqualityFunc(pd.Series, self.seriesEquality) # Convert the input list to a series and validate it - results = self.validator.validate(pd.Series(series, dtype=series_dtype)) + results = self.validator.validate_series(pd.Series(series, dtype=series_dtype)) # Now find any items where their validation does not correspond to the expected_result for item, result in zip(series, results): diff --git a/test/test_validation_warning.py b/test/test_validation_warning.py old mode 100644 new mode 100755 From dcb04c45e0b922a7a7e7aba2d764d359e5a64ef4 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 21 Jan 2020 00:54:36 +1100 Subject: [PATCH 03/13] Clarify and cleanup Warning class, add back in the standard validations like LeadingWhitespace --- pandas_schema/core.py | 104 +++++++++++++++-- pandas_schema/validations.py | 217 ++++++++++++++++++++++++++++++++++- test/test_validation.py | 15 ++- 3 files changed, 313 insertions(+), 23 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 9b3c2fc..08807b3 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -8,30 +8,61 @@ import re from . 
import column -from .validation_warning import ValidationWarning from .errors import PanSchArgumentError, PanSchNoIndexError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class _BaseValidation(abc.ABC): +class BaseValidation(abc.ABC): """ A validation is, broadly, just a function that maps a data frame to a list of errors """ + @abc.abstractmethod - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: """ Validates a data frame :param df: Data frame to validate :return: All validation failures detected by this validation """ + class Warning: + """ + Represents a difference between the schema and data frame, found during the validation of the data frame + Child classes can define their own subclass of :py:class:~pandas_schema.core.BaseValidation.Warning, but + need only do that if the subclass needs to store additional data. + """ + + def __init__(self, validation: 'BaseValidation', message: str): + self.message = message -class _SeriesValidation(_BaseValidation): + def __str__(self) -> str: + """ + The entire warning message as a string + """ + return self.message + + +class SeriesValidation(BaseValidation): """ A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation to it """ + class Warning(BaseValidation.Warning): + """ + Represents a difference between the schema and data frame, found during the validation of the data frame + """ + + def __init__(self, validation: BaseValidation, message: str, series: pd.Series): + super().__init__(validation, message) + self.series = series + + def __str__(self) -> str: + """ + The entire warning message as a string + """ + return '{} {}'.format(self.series.name, self.message) + @abc.abstractmethod def select_series(self, df: pd.DataFrame) -> pd.Series: """ @@ -39,31 +70,46 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: """ @abc.abstractmethod - def 
validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: """ Validate a single series """ - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: series = self.select_series(df) return self.validate_series(series) -class IndexSeriesValidation(_SeriesValidation): +class IndexSeriesValidation(SeriesValidation): """ Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation or later """ + class Warning(SeriesValidation.Warning): + """ + Represents a difference between the schema and data frame, found during the validation of the data frame + """ + + def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional): + super().__init__(validation, message, series) + self.col_index = col_index + self.positional = positional - def __init__(self, index: typing.Union[int, str] = None, position: bool = False, message:str=None): + def __str__(self) -> str: + """ + The entire warning message as a string + """ + return 'Column {} {}'.format(self.col_index, self.message) + + def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series - :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). + :param positional: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ self.index = index - self.position = position + self.positional = positional self.custom_message = message @property @@ -99,11 +145,45 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: if self.index is None: raise PanSchNoIndexError() - if self.position: + if self.positional: return df.iloc[self.index] else: return df.loc[self.index] @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: pass + + +class BooleanSeriesValidation(IndexSeriesValidation): + """ + Validation is defined by the function :py:meth:~select_cells that returns a boolean series. + Each cell that has False has failed the validation. + + Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, + because the data is in the same form for each cell. You need only define a :py:meth~default_message. + """ + class Warning(IndexSeriesValidation.Warning): + def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional, row_index, value): + super().__init__(validation, message, series, col_index, positional) + self.row_index = row_index + self.value = value + + def __str__(self) -> str: + return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row_index, self.col_index, self.value, self.message) + + @abc.abstractmethod + def select_cells(self, series: pd.Series) -> pd.Series: + """ + A BooleanSeriesValidation must return a boolean series. 
Each cell that has False has failed the + validation + :param series: The series to validate + """ + pass + + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + indices = self.select_cells(series) + cells = series[indices] + return ( + Warning(self, self.message, series, self.index, self.positional, row_idx, cell) for row_idx, cell in cells.items() + ) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 14c3df8..58b9396 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -1,10 +1,35 @@ -from .core import _SeriesValidation, IndexSeriesValidation +from .core import SeriesValidation, IndexSeriesValidation, BooleanSeriesValidation from .validation_warning import ValidationWarning +from .errors import PanSchError, PanSchArgumentError +import numpy as np import pandas as pd import math import typing +import datetime -class InRangeValidation(IndexSeriesValidation): + +class IsDtypeValidation(IndexSeriesValidation): + """ + Checks that a series has a certain numpy dtype + """ + + def __init__(self, dtype: np.dtype, **kwargs): + """ + :param dtype: The numpy dtype to check the column against + """ + self.dtype = dtype + super().__init__(**kwargs) + + @property + def default_message(self): + return 'did not have the dtype "{}"'.format(self.dtype.name) + + def validate_series(self, series: pd.Series): + if not series.dtype == self.dtype: + return [self.Warning(self, self.message, series, self.index, self.positional)] + + +class InRangeValidation(BooleanSeriesValidation): """ Checks that each element in the series is within a given numerical range """ @@ -18,10 +43,192 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) + def select_cells(self, series: pd.Series) -> pd.Series: + series = pd.to_numeric(series) + return (series >= self.min) & (series < self.max) + @property def default_message(self): return 'was not in the range 
[{}, {})'.format(self.min, self.max) - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - series = pd.to_numeric(series) - return (series >= self.min) & (series < self.max) + +class CanCallValidation(BooleanSeriesValidation): + """ + Validates if a given function can be called on each element in a column without raising an exception + """ + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.apply(self.can_call) + + def __init__(self, func: typing.Callable, **kwargs): + """ + :param func: A python function that will be called with the value of each cell in the DataFrame. If this + function throws an error, this cell is considered to have failed the validation. Otherwise it has passed. + """ + if callable(type): + self.callable = func + else: + raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) + super().__init__(**kwargs) + + @property + def default_message(self): + return 'raised an exception when the callable {} was called on it'.format(self.callable) + + def can_call(self, var): + try: + self.callable(var) + return True + except: + return False + + +class CanConvertValidation(CanCallValidation): + """ + Checks if each element in a column can be converted to a Python object type + """ + + """ + Internally this uses the same logic as CanCallValidation since all types are callable in python. + However this class overrides the error messages to make them more directed towards types + """ + + def __init__(self, _type: type, **kwargs): + """ + :param _type: Any python type. Its constructor will be called with the value of the individual cell as its + only argument. 
If it throws an exception, the value is considered to fail the validation, otherwise it has passed + """ + if isinstance(_type, type): + super(CanConvertValidation, self).__init__(_type, **kwargs) + else: + raise PanSchArgumentError('{} is not a valid type'.format(_type)) + + @property + def default_message(self): + return 'cannot be converted to type {}'.format(self.callable) + + +class MatchesPatternValidation(BooleanSeriesValidation): + """ + Validates that a string or regular expression can match somewhere in each element in this column + """ + + def __init__(self, pattern, options={}, **kwargs): + """ + :param kwargs: Arguments to pass to Series.str.contains + (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html) + pat is the only required argument + """ + self.pattern = pattern + self.options = options + super().__init__(**kwargs) + + @property + def default_message(self): + return 'does not match the pattern "{}"'.format(self.pattern) + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.astype(str).str.contains(self.pattern, **self.options) + + +class TrailingWhitespaceValidation(BooleanSeriesValidation): + """ + Checks that there is no trailing whitespace in this column + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @property + def default_message(self): + return 'contains trailing whitespace' + + def select_cells(self, series: pd.Series) -> pd.Series: + return ~series.astype(str).str.contains('\s+$') + + +class LeadingWhitespaceValidation(BooleanSeriesValidation): + """ + Checks that there is no leading whitespace in this column + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @property + def default_message(self): + return 'contains leading whitespace' + + def select_cells(self, series: pd.Series) -> pd.Series: + return ~series.astype(str).str.contains('^\s+') + + +class IsDistinctValidation(BooleanSeriesValidation): + """ + Checks that every element 
of this column is different from each other element + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @property + def default_message(self): + return 'contains values that are not unique' + + def select_cells(self, series: pd.Series) -> pd.Series: + return ~series.duplicated(keep='first') + + +class InListValidation(BooleanSeriesValidation): + """ + Checks that each element in this column is contained within a list of possibilities + """ + + def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwargs): + """ + :param options: A list of values to check. If the value of a cell is in this list, it is considered to pass the + validation + """ + self.case_sensitive = case_sensitive + self.options = options + super().__init__(**kwargs) + + @property + def default_message(self): + values = ', '.join(str(v) for v in self.options) + return 'is not in the list of legal options ({})'.format(values) + + def select_cells(self, series: pd.Series) -> pd.Series: + if self.case_sensitive: + return series.isin(self.options) + else: + return series.str.lower().isin([s.lower() for s in self.options]) + + +class DateFormatValidation(BooleanSeriesValidation): + """ + Checks that each element in this column is a valid date according to a provided format string + """ + + def __init__(self, date_format: str, **kwargs): + """ + :param date_format: The date format string to validate the column against. 
Refer to the date format code + documentation at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for a full + list of format codes + """ + self.date_format = date_format + super().__init__(**kwargs) + + @property + def default_message(self): + return 'does not match the date format string "{}"'.format(self.date_format) + + def valid_date(self, val): + try: + datetime.datetime.strptime(val, self.date_format) + return True + except: + return False + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.astype(str).apply(self.valid_date) diff --git a/test/test_validation.py b/test/test_validation.py index d8928dc..fef3958 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -1,3 +1,6 @@ +""" +Tests for pandas_schema.validations +""" import json import unittest import re @@ -5,8 +8,8 @@ from numpy import nan, dtype import pandas as pd -from pandas_schema.validations import InRangeValidation -from pandas_schema.core import _BaseValidation +from pandas_schema.validations import * +from pandas_schema.core import BooleanSeriesValidation from pandas_schema import ValidationWarning @@ -15,24 +18,24 @@ def seriesEquality(self, s1: pd.Series, s2: pd.Series, msg: str = None): if not s1.equals(s2): raise self.failureException(msg) - def validate_and_compare(self, series: list, expected_result: bool, msg: str = None, series_dtype: object = None): + def validate_and_compare(self, series: list, expected_result: bool, msg: str = None): """ Checks that every element in the provided series is equal to `expected_result` after validation - :param series_dtype: Explicity specifies the dtype for the generated Series + :param series_dtype: Explicitly specifies the dtype for the generated Series :param series: The series to check :param expected_result: Whether the elements in this series should pass the validation :param msg: The message to display if this test fails """ # Check that self.validator is correct - if not 
self.validator or not isinstance(self.validator, _BaseValidation): + if not self.validator or not isinstance(self.validator, BooleanSeriesValidation): raise ValueError('The class must have the validator field set to an instance of a Validation subclass') # Ensure we're comparing series correctly self.addTypeEqualityFunc(pd.Series, self.seriesEquality) # Convert the input list to a series and validate it - results = self.validator.validate_series(pd.Series(series, dtype=series_dtype)) + results = self.validator.select_cells(pd.Series(series)) # Now find any items where their validation does not correspond to the expected_result for item, result in zip(series, results): From 8bffe9357090526aa9d424c5a44e57f744c6a401 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 24 Jan 2020 01:13:13 +1100 Subject: [PATCH 04/13] Re-use some old validations for nicer diff; fix some tests --- pandas_schema/validations.py | 100 ++++++++++++++++++++++++++--------- test/test_validation.py | 24 ++++----- 2 files changed, 88 insertions(+), 36 deletions(-) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 58b9396..ea36595 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -1,32 +1,61 @@ -from .core import SeriesValidation, IndexSeriesValidation, BooleanSeriesValidation -from .validation_warning import ValidationWarning -from .errors import PanSchError, PanSchArgumentError -import numpy as np -import pandas as pd +import abc import math -import typing import datetime +import pandas as pd +import numpy as np +import typing +import operator +from . 
import column +from .core import IndexSeriesValidation, BooleanSeriesValidation +from .validation_warning import ValidationWarning +from .errors import PanSchArgumentError +from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class IsDtypeValidation(IndexSeriesValidation): + +class CustomSeriesValidation(BooleanSeriesValidation): """ - Checks that a series has a certain numpy dtype + Validates using a user-provided function that operates on an entire series (for example by using one of the pandas + Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) """ - def __init__(self, dtype: np.dtype, **kwargs): + def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): """ - :param dtype: The numpy dtype to check the column against + :param message: The error message to provide to the user if this validation fails. The row and column and + failing value will automatically be prepended to this message, so you only have to provide a message that + describes what went wrong, for example 'failed my validation' will become + + {row: 1, column: "Column Name"}: "Value" failed my validation + :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal + to True if the object passed validation, and False if it failed """ - self.dtype = dtype - super().__init__(**kwargs) + self._validation = validation + super().__init__(message=message) - @property - def default_message(self): - return 'did not have the dtype "{}"'.format(self.dtype.name) + def select_cells(self, series: pd.Series) -> pd.Series: + return self._validation(series) + + +class CustomElementValidation(BooleanSeriesValidation): + """ + Validates using a user-provided function that operates on each element + """ + + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): + """ + :param message: The error message to provide to the user if this validation fails. 
The row and column and + failing value will automatically be prepended to this message, so you only have to provide a message that + describes what went wrong, for example 'failed my validation' will become - def validate_series(self, series: pd.Series): - if not series.dtype == self.dtype: - return [self.Warning(self, self.message, series, self.index, self.positional)] + {row: 1, column: "Column Name"}: "Value" failed my validation + :param validation: A function that takes the value of a data frame cell and returns True if it passes the + the validation, and false if it doesn't + """ + self._validation = validation + super().__init__(message=message) + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.apply(self._validation) class InRangeValidation(BooleanSeriesValidation): @@ -43,13 +72,36 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) + @property + def default_message(self): + return 'was not in the range [{}, {})'.format(self.min, self.max) + def select_cells(self, series: pd.Series) -> pd.Series: series = pd.to_numeric(series) return (series >= self.min) & (series < self.max) - @property - def default_message(self): - return 'was not in the range [{}, {})'.format(self.min, self.max) + +class IsDtypeValidation(IndexSeriesValidation): + """ + Checks that a series has a certain numpy dtype + """ + + def __init__(self, dtype: np.dtype, **kwargs): + """ + :param dtype: The numpy dtype to check the column against + """ + self.dtype = dtype + super().__init__(**kwargs) + + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + if not np.issubdtype(series.dtype, self.dtype): + return [ValidationWarning( + 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( + column.name if column else '', series.dtype, self.dtype + ) + )] + else: + return [] class CanCallValidation(BooleanSeriesValidation): @@ -57,9 +109,6 @@ 
class CanCallValidation(BooleanSeriesValidation): Validates if a given function can be called on each element in a column without raising an exception """ - def select_cells(self, series: pd.Series) -> pd.Series: - return series.apply(self.can_call) - def __init__(self, func: typing.Callable, **kwargs): """ :param func: A python function that will be called with the value of each cell in the DataFrame. If this @@ -82,6 +131,9 @@ def can_call(self, var): except: return False + def select_cells(self, series: pd.Series) -> pd.Series: + return series.apply(self.can_call) + class CanConvertValidation(CanCallValidation): """ diff --git a/test/test_validation.py b/test/test_validation.py index fef3958..0bd6623 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -390,7 +390,7 @@ def test_valid_strings(self): ) def test_invalid_strings(self): - validation = self.validator.validate(pd.Series([ + validation = self.validator.select_cells(pd.Series([ '1', '1', '3', @@ -476,7 +476,7 @@ def setUp(self): self.validator = IsDtypeValidation(np.number) def test_valid_items(self): - errors = self.validator.get_errors(pd.Series( + errors = self.validator.validate_series(pd.Series( [ 1, 2, @@ -486,7 +486,7 @@ def test_valid_items(self): self.assertEqual(len(errors), 0) def test_invalid_items(self): - errors = self.validator.get_errors(pd.Series( + errors = self.validator.validate_series(pd.Series( [ 'a', '', @@ -600,7 +600,7 @@ def setUp(self): def test_default_message(self): validator = InRangeValidation(min=4) - for error in validator.get_errors(pd.Series( + for error in validator.validate_series(pd.Series( [ 1, 2, @@ -611,7 +611,7 @@ def test_default_message(self): def test_custom_message(self): validator = InRangeValidation(min=4, message=self.message) - for error in validator.get_errors(pd.Series( + for error in validator.validate_series(pd.Series( [ 1, 2, @@ -631,17 +631,17 @@ def setUp(self): def test_in_range_allow_empty_with_error(self): validator = 
InRangeValidation(min=4) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): validator = InRangeValidation(min=0) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): validator = InRangeValidation(min=4) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=False)) + errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=False)) self.assertEqual(len(errors), len(self.vals)) @@ -654,21 +654,21 @@ def setUp(self): self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) def test_valid_elements(self): - errors = self.validator.get_errors(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), + errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), Column('', allow_empty=True)) self.assertEqual(len(errors), 0) def test_invalid_empty_elements(self): - errors = self.validator.get_errors(pd.Series(['aa', 'bb', 'd', None], dtype='category'), + errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category'), Column('', allow_empty=False)) self.assertEqual(len(errors), 4) def test_invalid_and_empty_elements(self): - errors = self.validator.get_errors(pd.Series(['a', None], dtype='category'), + errors = self.validator.validate_series(pd.Series(['a', None], dtype='category'), Column('', allow_empty=False)) self.assertEqual(len(errors), 1) def test_invalid_elements(self): - errors = self.validator.get_errors(pd.Series(['aa', 'bb', 'd'], dtype='category'), + errors = 
self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category'), Column('', allow_empty=True)) self.assertEqual(len(errors), 3) From 9c6b910bd7b7a7d27c5aa971cb7585589b57294d Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 31 Jan 2020 20:54:22 +1100 Subject: [PATCH 05/13] Some miscellaneous design docs and updates --- TODO.md | 1 + UPDATE.md | 28 +++++++++++++++++++++ pandas_schema/core.py | 47 +++++++++++++----------------------- pandas_schema/validations.py | 4 ++- setup.py | 6 ++++- test/test_validation.py | 22 +++++++---------- 6 files changed, 63 insertions(+), 45 deletions(-) create mode 100755 TODO.md create mode 100755 UPDATE.md diff --git a/TODO.md b/TODO.md new file mode 100755 index 0000000..4350b8b --- /dev/null +++ b/TODO.md @@ -0,0 +1 @@ +* Add validations that apply to every column in the DF equally \ No newline at end of file diff --git a/UPDATE.md b/UPDATE.md new file mode 100755 index 0000000..80c3562 --- /dev/null +++ b/UPDATE.md @@ -0,0 +1,28 @@ +# ValidationWarnings +## Options for the ValidationWarning data +* We keep it as is, with one single ValidationWarning class that stores a `message` and a reference to the validation +that spawned it +* PREFERRED: As above, but we add a dictionary of miscellaneous kwargs to the ValidationWarning for storing stuff like the row index that failed +* We have a dataclass for each Validation type that stores things in a more structured way + * Why bother doing this if the Validation stores its own structure for the column index etc? + +## Options for the ValidationWarning message +* It's generated from the Validation as a fixed string, as it is now +* It's generated dynamically by the VW + * This means that custom messages means overriding the VW class +* PREFERRED: It's generated dynamically in the VW by calling the parent Validation with a reference to itself, e.g. 
+ ```python + class ValidationWarning: + def __str__(self): + return self.validation.generate_message(self) + + class Validation: + def generate_message(warning: ValidationWarning) -> str: + pass + ``` + * This lets the message function use all the validation properties, and the dictionary of kwargs that it specified + * `generate_message()` will call `default_message(**kwargs)`, the dynamic class method, or `self.custom_message`, the + non-dynamic string specified by the user + * Each category of Validation will define a `create_prefix()` method, that creates the {row: 1, column: 2} prefix + that goes before each message. Thus, `generate_message()` will concatenate that with the actual message +* diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 08807b3..0a7da33 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -6,6 +6,7 @@ import typing import operator import re +from dataclasses import dataclass from . import column from .errors import PanSchArgumentError, PanSchNoIndexError @@ -25,21 +26,23 @@ def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: :return: All validation failures detected by this validation """ + def message(self, **kwargs) -> str: + pass + + @dataclass class Warning: """ Represents a difference between the schema and data frame, found during the validation of the data frame Child classes can define their own subclass of :py:class:~pandas_schema.core.BaseValidation.Warning, but need only do that if the subclass needs to store additional data. 
""" - - def __init__(self, validation: 'BaseValidation', message: str): - self.message = message + validation: 'BaseValidation' def __str__(self) -> str: """ The entire warning message as a string """ - return self.message + return self.validation.message() class SeriesValidation(BaseValidation): @@ -52,16 +55,7 @@ class Warning(BaseValidation.Warning): """ Represents a difference between the schema and data frame, found during the validation of the data frame """ - - def __init__(self, validation: BaseValidation, message: str, series: pd.Series): - super().__init__(validation, message) - self.series = series - - def __str__(self) -> str: - """ - The entire warning message as a string - """ - return '{} {}'.format(self.series.name, self.message) + series: pd.Series @abc.abstractmethod def select_series(self, df: pd.DataFrame) -> pd.Series: @@ -89,17 +83,8 @@ class Warning(SeriesValidation.Warning): """ Represents a difference between the schema and data frame, found during the validation of the data frame """ - - def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional): - super().__init__(validation, message, series) - self.col_index = col_index - self.positional = positional - - def __str__(self) -> str: - """ - The entire warning message as a string - """ - return 'Column {} {}'.format(self.col_index, self.message) + col_index: int + positional: bool def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, message: str = None): """ @@ -113,24 +98,26 @@ def __init__(self, index: typing.Union[int, str] = None, positional: bool = Fals self.custom_message = message @property - def message(self): + def message(self, **kwargs): """ Gets a message describing how the DataFrame cell failed the validation This shouldn't really be overridden, instead override default_message so that users can still set per-object messages :return: """ - return self.custom_message or self.default_message + if 
self.custom_message: + return self.custom_message() + else: + return self.default_message(**kwargs) @property - def readable_name(self): + def readable_name(self, **kwargs): """ A readable name for this validation, to be shown in validation warnings """ return type(self).__name__ - @property - def default_message(self) -> str: + def default_message(self, **kwargs) -> str: """ Create a message to be displayed whenever this validation fails This should be a generic message for the validation type, but can be overwritten if the user provides a diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index ea36595..d2d4b72 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -85,7 +85,6 @@ class IsDtypeValidation(IndexSeriesValidation): """ Checks that a series has a certain numpy dtype """ - def __init__(self, dtype: np.dtype, **kwargs): """ :param dtype: The numpy dtype to check the column against @@ -93,6 +92,9 @@ def __init__(self, dtype: np.dtype, **kwargs): self.dtype = dtype super().__init__(**kwargs) + def default_message(self) -> str: + return 'has a dtype of {} which is not a subclass of the required type {}'.format(self.dtype,) + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( diff --git a/setup.py b/setup.py index 8d8d0fd..2441567 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,11 @@ def run(self): ], keywords='pandas csv verification schema', packages=find_packages(include=['pandas_schema']), - install_requires=['numpy', 'pandas>=0.19'], + install_requires=[ + 'numpy', + 'pandas>=0.19', + 'dataclasses' + ], cmdclass={ 'build_readme': BuildReadme, 'build_site': BuildHtmlDocs diff --git a/test/test_validation.py b/test/test_validation.py index 0bd6623..8efed36 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -606,7 +606,7 @@ def test_default_message(self): 2, 3 ] - ), Column('')): + )): 
self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') def test_custom_message(self): @@ -617,7 +617,7 @@ def test_custom_message(self): 2, 3 ] - ), Column('')): + )): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') @@ -631,17 +631,17 @@ def setUp(self): def test_in_range_allow_empty_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals)) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): validator = InRangeValidation(min=0) - errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals)) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=False)) + errors = validator.validate_series(pd.Series(self.vals)) self.assertEqual(len(errors), len(self.vals)) @@ -654,21 +654,17 @@ def setUp(self): self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) def test_valid_elements(self): - errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), - Column('', allow_empty=True)) + errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) self.assertEqual(len(errors), 0) def test_invalid_empty_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category'), - Column('', allow_empty=False)) + errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) self.assertEqual(len(errors), 4) def test_invalid_and_empty_elements(self): - errors = 
self.validator.validate_series(pd.Series(['a', None], dtype='category'), - Column('', allow_empty=False)) + errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) self.assertEqual(len(errors), 1) def test_invalid_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category'), - Column('', allow_empty=True)) + errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) self.assertEqual(len(errors), 3) From 28e2c115371bb9bef88407b0a1dd693b0c94b020 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sun, 2 Feb 2020 00:09:40 +1100 Subject: [PATCH 06/13] Sort out new ValidationWarning structure --- pandas_schema/core.py | 60 ++++++++--------------------- pandas_schema/validation_warning.py | 37 ++++++++++-------- pandas_schema/validations.py | 23 ++++++----- test/test_validation.py | 1 + 4 files changed, 50 insertions(+), 71 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 0a7da33..76cf323 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -10,6 +10,7 @@ from . 
import column from .errors import PanSchArgumentError, PanSchNoIndexError +from pandas_schema.validation_warning import ValidationWarning from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -19,44 +20,23 @@ class BaseValidation(abc.ABC): """ @abc.abstractmethod - def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ Validates a data frame :param df: Data frame to validate :return: All validation failures detected by this validation """ - def message(self, **kwargs) -> str: + def message(self, warning: ValidationWarning) -> str: pass - @dataclass - class Warning: - """ - Represents a difference between the schema and data frame, found during the validation of the data frame - Child classes can define their own subclass of :py:class:~pandas_schema.core.BaseValidation.Warning, but - need only do that if the subclass needs to store additional data. - """ - validation: 'BaseValidation' - - def __str__(self) -> str: - """ - The entire warning message as a string - """ - return self.validation.message() - class SeriesValidation(BaseValidation): """ - A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation - to it + A _SeriesValidation validates a DataFrame by selecting a single series from it, and + applying some validation to it """ - class Warning(BaseValidation.Warning): - """ - Represents a difference between the schema and data frame, found during the validation of the data frame - """ - series: pd.Series - @abc.abstractmethod def select_series(self, df: pd.DataFrame) -> pd.Series: """ @@ -64,12 +44,12 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: """ @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: """ Validate a single series """ - def validate(self, df: 
pd.DataFrame) -> typing.Iterable[Warning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: series = self.select_series(df) return self.validate_series(series) @@ -79,14 +59,9 @@ class IndexSeriesValidation(SeriesValidation): Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation or later """ - class Warning(SeriesValidation.Warning): - """ - Represents a difference between the schema and data frame, found during the validation of the data frame - """ - col_index: int - positional: bool - def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, message: str = None): + def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, + message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series @@ -138,7 +113,7 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return df.loc[self.index] @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: pass @@ -150,14 +125,6 @@ class BooleanSeriesValidation(IndexSeriesValidation): Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, because the data is in the same form for each cell. You need only define a :py:meth~default_message. 
""" - class Warning(IndexSeriesValidation.Warning): - def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional, row_index, value): - super().__init__(validation, message, series, col_index, positional) - self.row_index = row_index - self.value = value - - def __str__(self) -> str: - return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row_index, self.col_index, self.value, self.message) @abc.abstractmethod def select_cells(self, series: pd.Series) -> pd.Series: @@ -168,9 +135,12 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: indices = self.select_cells(series) cells = series[indices] return ( - Warning(self, self.message, series, self.index, self.positional, row_idx, cell) for row_idx, cell in cells.items() + ValidationWarning(self, { + 'row': row_idx, + 'value': cell + }) for row_idx, cell in cells.items() ) diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index 320be65..3eec3db 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -1,22 +1,25 @@ +import pandas_schema +from dataclasses import dataclass, field + + +@dataclass class ValidationWarning: """ - Represents a difference between the schema and data frame, found during the validation of the data frame + Represents a difference between the schema and data frame, found during the validation + of the data frame + """ + validation: 'pandas_schema.BaseValidation' + """ + The validation that spawned this warning """ - def __init__(self, message: str, value: str = None, row: int = -1, column: str = None): - self.message = message - self.value = value - """The value of the failing cell in the DataFrame""" - self.row = row - """The row index (usually an integer starting from 0) of the cell that failed the 
validation""" - self.column = column - """The column name of the cell that failed the validation""" + props: dict = field(default_factory=dict) + """ + List of data about this warning in addition to that provided by the validation, for + example, if a cell in the DataFrame didn't match the validation, the props might + include a `value` key, for storing what the actual value was + """ - def __str__(self) -> str: - """ - The entire warning message as a string - """ - if self.row is not None and self.column is not None and self.value is not None: - return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row, self.column, self.value, self.message) - else: - return self.message + @property + def message(self): + return self.validation.message(self) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index d2d4b72..442f237 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -41,7 +41,8 @@ class CustomElementValidation(BooleanSeriesValidation): Validates using a user-provided function that operates on each element """ - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], + message: str): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -85,22 +86,23 @@ class IsDtypeValidation(IndexSeriesValidation): """ Checks that a series has a certain numpy dtype """ + def __init__(self, dtype: np.dtype, **kwargs): """ :param dtype: The numpy dtype to check the column against """ - self.dtype = dtype super().__init__(**kwargs) + self.dtype = dtype - def default_message(self) -> str: - return 'has a dtype of {} which is not a subclass of the required type {}'.format(self.dtype,) + def default_message(self, validation) -> str: + return 'has a dtype of {} which is not a subclass of the required type {}'.format( + self.dtype, validation.props['dtype']) def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( - 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( - column.name if column else '', series.dtype, self.dtype - ) + self, + {'dtype': series.dtype} )] else: return [] @@ -119,12 +121,15 @@ def __init__(self, func: typing.Callable, **kwargs): if callable(type): self.callable = func else: - raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) + raise PanSchArgumentError( + 'The object "{}" passed to CanCallValidation is not callable!'.format( + type)) super().__init__(**kwargs) @property def default_message(self): - return 'raised an exception when the callable {} was called on it'.format(self.callable) + return 'raised an exception when the callable {} was called on it'.format( + self.callable) def can_call(self, var): try: diff --git a/test/test_validation.py b/test/test_validation.py index 8efed36..dbea90f 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,6 +10,7 @@ from pandas_schema.validations import * from pandas_schema.core import BooleanSeriesValidation +from 
pandas_schema.schema import Schema from pandas_schema import ValidationWarning From 04893b3c0c39ef171b8f301f3298d76a2f20dbf0 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 4 Feb 2020 01:16:04 +1100 Subject: [PATCH 07/13] Add indexer class, solidify message format and ValidationWarning --- pandas_schema/column.py | 78 +++++++++++++++++++------------ pandas_schema/core.py | 31 ++++++------ pandas_schema/errors.py | 14 +++++- pandas_schema/index.py | 68 +++++++++++++++++++++++++++ pandas_schema/schema.py | 91 +++++++++--------------------------- pandas_schema/validations.py | 30 +++++------- test/test_validation.py | 35 +++++++------- 7 files changed, 195 insertions(+), 152 deletions(-) create mode 100755 pandas_schema/index.py diff --git a/pandas_schema/column.py b/pandas_schema/column.py index e0df39a..048907c 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,51 +1,67 @@ import typing -import pandas as pd import pandas_schema.core -from .validation_warning import ValidationWarning +from pandas_schema.index import PandasIndexer -def _column( + +def column( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: typing.Union[int, str] = None, - position: bool = False + index: PandasIndexer = None, + override: bool = False ): """ A utility method for setting the index data on a set of Validations :param validations: A list of validations to modify :param index: The index of the series that these validations will now consider - :param position: If true, these validations use positional indexing. + :param override: If true, override existing index values. 
Otherwise keep the existing ones See :py:class:`pandas_schema.validation.IndexSeriesValidation` """ for valid in validations: - valid.index = index - valid.position = position + if override or valid.index is None: + valid.index = index -def label_column( - validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: typing.Union[int, str], -): - """ - A utility method for setting the label-based column for each validation - :param validations: A list of validations to modify - :param index: The label of the series that these validations will now consider - """ - return _column( - validations, - index, - position=False - ) -def positional_column( +def column_sequence( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: int, + override: bool = False ): """ - A utility method for setting the position-based column for each validation + A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so + that the first validation gets index 0, the second gets index 1 etc. Note: this will not modify any index that + already has some kind of index :param validations: A list of validations to modify - :param index: The index of the series that these validations will now consider + :param override: If true, override existing index values. 
Otherwise keep the existing ones """ - return _column( - validations, - index, - position=True - ) + for i, valid in validations: + if override or valid.index is None: + valid.index = PandasIndexer(i, typ='positional') +# +# def label_column( +# validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], +# index: typing.Union[int, str], +# ): +# """ +# A utility method for setting the label-based column for each validation +# :param validations: A list of validations to modify +# :param index: The label of the series that these validations will now consider +# """ +# return _column( +# validations, +# index, +# position=False +# ) +# +# def positional_column( +# validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], +# index: int, +# ): +# """ +# A utility method for setting the position-based column for each validation +# :param validations: A list of validations to modify +# :param index: The index of the series that these validations will now consider +# """ +# return _column( +# validations, +# index, +# position=True + diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 76cf323..54ce0b6 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -11,6 +11,7 @@ from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning +from pandas_schema.index import PandasIndexer from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -60,30 +61,33 @@ class IndexSeriesValidation(SeriesValidation): or later """ - def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, - message: str = None): + def __init__(self, index: PandasIndexer = None, message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series - :param positional: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ self.index = index - self.positional = positional self.custom_message = message - @property - def message(self, **kwargs): + def message(self, warning: ValidationWarning): """ Gets a message describing how the DataFrame cell failed the validation This shouldn't really be overridden, instead override default_message so that users can still set per-object messages :return: """ + if self.index.type == 'position': + prefix = self.index.index + else: + prefix = '"{}"'.format(self.index.index) + if self.custom_message: - return self.custom_message() + suffix = self.custom_message else: - return self.default_message(**kwargs) + suffix = self.default_message(warning) + + return "Column {} {}".format(prefix, suffix) @property def readable_name(self, **kwargs): @@ -92,7 +96,7 @@ def readable_name(self, **kwargs): """ return type(self).__name__ - def default_message(self, **kwargs) -> str: + def default_message(self, warning: ValidationWarning) -> str: """ Create a message to be displayed whenever this validation fails This should be a generic message for the validation type, but can be overwritten if the user provides a @@ -107,10 +111,7 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: if self.index is None: raise PanSchNoIndexError() - if self.positional: - return df.iloc[self.index] - else: - return df.loc[self.index] + return self.index(df) @abc.abstractmethod def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: @@ -136,8 +137,8 @@ def select_cells(self, series: pd.Series) -> pd.Series: pass def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - indices = self.select_cells(series) - cells = series[indices] + failed = ~self.select_cells(series) + cells = series[failed] return ( ValidationWarning(self, { 'row': row_idx, diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py index ab5e73d..cdc3132 100755 --- 
a/pandas_schema/errors.py +++ b/pandas_schema/errors.py @@ -1,8 +1,20 @@ -class PanSchError(BaseException): +class PanSchError(Exception): """ Base class for all pandas_schema exceptions """ + def __init__(self, message=None): + super().__init__(message) + + +class PanSchIndexError(PanSchError): + """ + Some issue with creating a PandasIndexer + """ + + def __init__(self, message): + super().__init__(message=message) + class PanSchInvalidSchemaError(PanSchError): """ diff --git a/pandas_schema/index.py b/pandas_schema/index.py new file mode 100755 index 0000000..d37cd91 --- /dev/null +++ b/pandas_schema/index.py @@ -0,0 +1,68 @@ +from pandas_schema.errors import PanSchIndexError +from dataclasses import dataclass +from typing import Union +import numpy +import pandas + +IndexValue = Union[numpy.string_, numpy.int_, str, int] +""" +A pandas index can either be an integer or string, or an array of either. This typing is a bit sketchy because really +a lot of things are accepted here +""" + + +class PandasIndexer: + """ + An index into a particular axis of a DataFrame. 
Attempts to recreate the behaviour of `df.ix[some_index]` + """ + + valid_types = {'position', 'label'} + index: IndexValue + """ + The index to use, either an integer for position-based indexing, or a string for label-based indexing + """ + type: str + """ + The type of indexing to use, either 'position' or 'label' + """ + + def __init__(self, index: IndexValue, typ: str = None): + self.index = index + + if typ is not None: + # If the type is provided, validate it + if typ not in self.valid_types: + raise PanSchIndexError('The index type was not one of {}'.format(' or '.join(self.valid_types))) + else: + self.type = typ + else: + # If the type isn't provided, guess it based on the datatype of the index + if numpy.issubdtype(type(index), numpy.character): + self.type = 'label' + elif numpy.issubdtype(type(index), numpy.int_): + self.type = 'position' + else: + raise PanSchIndexError('The index value was not either an integer or string, or an array of either of ' + 'these') + + + def __call__(self, df: pandas.DataFrame, axis: int = 0): + """ + Apply this index + :param df: The DataFrame to index + :param axis: The axis to index along. 
axis=0 will select a row, and axis=1 will select a column + """ + if self.type == 'label': + return df.loc(axis=axis)[self.index] + elif self.type == 'label': + return df.iloc(axis=axis)[self.index] + + +class RowIndexer(PandasIndexer): + def __call__(self, df: pandas.DataFrame): + return super().__call__(df, axis=0) + + +class ColumnIndexer(PandasIndexer): + def __call__(self, df: pandas.DataFrame): + return super().__call__(df, axis=1) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index da27d81..83ad9c5 100755 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -1,8 +1,10 @@ import pandas as pd import typing -from .errors import PanSchInvalidSchemaError, PanSchArgumentError -from .validation_warning import ValidationWarning +from pandas_schema.core import BaseValidation +from pandas_schema.errors import PanSchArgumentError, PanSchInvalidSchemaError +from pandas_schema.validation_warning import ValidationWarning +from pandas_schema.index import PandasIndexer class Schema: @@ -10,83 +12,32 @@ class Schema: A schema that defines the columns required in the target DataFrame """ - def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): + def __init__(self, validations: typing.Iterable[BaseValidation]): """ - :param columns: A list of column objects - :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring - the header names. False if the columns should be associated by column header names only. 
Defaults to False + :param validations: A list of validations that will be applied to the DataFrame upon validation """ - if not columns: - raise PanSchInvalidSchemaError('An instance of the schema class must have a columns list') + if not validations: + raise PanSchInvalidSchemaError('An instance of the schema class must have a validations list') - if not isinstance(columns, typing.List): - raise PanSchInvalidSchemaError('The columns field must be a list of Column objects') + if not isinstance(validations, typing.Iterable): + raise PanSchInvalidSchemaError('The columns field must be an iterable of Validation objects') - if not isinstance(ordered, bool): - raise PanSchInvalidSchemaError('The ordered field must be a boolean') + self.validations = list(validations) - self.columns = list(columns) - self.ordered = ordered - - def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing.List[ValidationWarning]: + def validate(self, df: pd.DataFrame, subset: PandasIndexer = None) -> typing.List[ValidationWarning]: """ Runs a full validation of the target DataFrame using the internal columns list :param df: A pandas DataFrame to validate - :param columns: A list of columns indicating a subset of the schema that we want to validate + :param subset: A list of columns indicating a subset of the schema that we want to validate. Can be any :return: A list of ValidationWarning objects that list the ways in which the DataFrame was invalid """ - errors = [] - df_cols = len(df.columns) - - # If no columns are passed, validate against every column in the schema. This is the default behaviour - if columns is None: - schema_cols = len(self.columns) - columns_to_pair = self.columns - if df_cols != schema_cols: - errors.append( - ValidationWarning( - 'Invalid number of columns. 
The schema specifies {}, but the data frame has {}'.format( - schema_cols, - df_cols) - ) - ) - return errors - - # If we did pass in columns, check that they are part of the current schema - else: - if set(columns).issubset(self.get_column_names()): - columns_to_pair = [column for column in self.columns if column.name in columns] - else: - raise PanSchArgumentError( - 'Columns {} passed in are not part of the schema'.format(set(columns).difference(self.columns)) - ) - - # We associate the column objects in the schema with data frame series either by name or by position, depending - # on the value of self.ordered - if self.ordered: - series = [x[1] for x in df.iteritems()] - column_pairs = zip(series, self.columns) - else: - column_pairs = [] - for column in columns_to_pair: - - # Throw an error if the schema column isn't in the data frame - if column.name not in df: - errors.append(ValidationWarning( - 'The column {} exists in the schema but not in the data frame'.format(column.name))) - return errors + # Apply the subset if we have one + if subset is not None: + df = subset(df) - column_pairs.append((df[column.name], column)) - - # Iterate over each pair of schema columns and data frame series and run validations - for series, column in column_pairs: - errors += column.validate(series) - - return sorted(errors, key=lambda e: e.row) - - def get_column_names(self): - """ - Returns the column names contained in the schema - """ - return [column.name for column in self.columns] + # Build the list of errors + errors = [] + for validation in self.validations: + errors.extend(validation.validate(df)) + return errors diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 442f237..d05e180 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -94,11 +94,11 @@ def __init__(self, dtype: np.dtype, **kwargs): super().__init__(**kwargs) self.dtype = dtype - def default_message(self, validation) -> str: + def 
default_message(self, warning: ValidationWarning) -> str: return 'has a dtype of {} which is not a subclass of the required type {}'.format( - self.dtype, validation.props['dtype']) + self.dtype, warning.props['dtype']) - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( self, @@ -126,8 +126,7 @@ def __init__(self, func: typing.Callable, **kwargs): type)) super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'raised an exception when the callable {} was called on it'.format( self.callable) @@ -162,8 +161,7 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'cannot be converted to type {}'.format(self.callable) @@ -182,8 +180,7 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the pattern "{}"'.format(self.pattern) def select_cells(self, series: pd.Series) -> pd.Series: @@ -198,8 +195,7 @@ class TrailingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains trailing whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -214,8 +210,7 @@ class LeadingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains leading whitespace' def 
select_cells(self, series: pd.Series) -> pd.Series: @@ -230,8 +225,7 @@ class IsDistinctValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains values that are not unique' def select_cells(self, series: pd.Series) -> pd.Series: @@ -252,8 +246,7 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) @@ -278,8 +271,7 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): diff --git a/test/test_validation.py b/test/test_validation.py index dbea90f..c02615e 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -6,11 +6,14 @@ import re from numpy import nan, dtype +import numpy as np import pandas as pd from pandas_schema.validations import * from pandas_schema.core import BooleanSeriesValidation +from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema +from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning @@ -510,20 +513,20 @@ def test_schema(self): }) schema = Schema([ - Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), - Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), - Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), + IsDtypeValidation(dtype('int64'), index=ci('wrong_dtype1')), + IsDtypeValidation(dtype('float64'), index=ci('wrong_dtype2')), + 
IsDtypeValidation(dtype('int64'), index=ci('wrong_dtype3')), ]) errors = schema.validate(df) self.assertEqual( - sorted([str(x) for x in errors]), - sorted([ - 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', - 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', - 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' - ]) + [x.props for x in errors], + [ + {'dtype': np.object}, + {'dtype': np.int64}, + {'dtype': np.float64}, + ] ) @@ -632,17 +635,17 @@ def setUp(self): def test_in_range_allow_empty_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals)) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): validator = InRangeValidation(min=0) - errors = validator.validate_series(pd.Series(self.vals)) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals)) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), len(self.vals)) @@ -656,16 +659,16 @@ def setUp(self): def test_valid_elements(self): errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) - self.assertEqual(len(errors), 0) + self.assertEqual(len(list(errors)), 0) def test_invalid_empty_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) - self.assertEqual(len(errors), 4) + self.assertEqual(len(list(errors)), 4) def test_invalid_and_empty_elements(self): errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) - 
self.assertEqual(len(errors), 1) + self.assertEqual(len(list(errors)), 1) def test_invalid_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) - self.assertEqual(len(errors), 3) + self.assertEqual(len(list(errors)), 3) From 7d8aa932ea09b2be3ad758ded34f2e653a08fe0d Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 6 Feb 2020 01:09:51 +1100 Subject: [PATCH 08/13] First attempt at CombinedValidations in the new API --- TODO.md | 6 +++++- pandas_schema/column.py | 4 +++- pandas_schema/core.py | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/TODO.md b/TODO.md index 4350b8b..0a6c2c6 100755 --- a/TODO.md +++ b/TODO.md @@ -1 +1,5 @@ -* Add validations that apply to every column in the DF equally \ No newline at end of file +* [ ] Add validations that apply to every column in the DF equally +* [x] Fix CombinedValidations +* [x] Add replacement for allow_empty Columns +* [ ] New column() tests +* [ ] New CombinedValidation tests \ No newline at end of file diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 048907c..ab3b58a 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -7,13 +7,15 @@ def column( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: PandasIndexer = None, - override: bool = False + override: bool = False, + allow_empty=False ): """ A utility method for setting the index data on a set of Validations :param validations: A list of validations to modify :param index: The index of the series that these validations will now consider :param override: If true, override existing index values. 
Otherwise keep the existing ones + :param allow_empty: Allow empty rows (NaN) to pass the validation See :py:class:`pandas_schema.validation.IndexSeriesValidation` """ for valid in validations: diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 54ce0b6..6c4bf99 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -34,7 +34,7 @@ def message(self, warning: ValidationWarning) -> str: class SeriesValidation(BaseValidation): """ - A _SeriesValidation validates a DataFrame by selecting a single series from it, and + A SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation to it """ @@ -145,3 +145,37 @@ def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarnin 'value': cell }) for row_idx, cell in cells.items() ) + + +class CombinedValidation(BaseValidation): + """ + Validates if one and/or the other validation is true for an element + """ + + def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator, + message: str): + super().__init__(message=message) + self.operator = operator + self.v_a = validation_a + self.v_b = validation_b + + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + # Let both validations separately select and filter a column + left_series = self.v_a.select_series(df) + right_series = self.v_a.select_series(df) + + left_failed = ~self.v_a.select_cells(left_series) + right_failed = ~self.v_b.select_cells(right_series) + + # Then, we combine the two resulting boolean series, and determine the row indices of the result + failed = self.operator(left_failed, right_failed) + + return ( + ValidationWarning(self, { + 'row': row_idx, + }) for row_idx in np.where(failed) + ) + + @property + def default_message(self): + return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) From c36761ae148fe90dfd900fe161e9135208541e66 Mon Sep 17 00:00:00 2001 From: Michael Milton 
Date: Sun, 16 Feb 2020 17:29:35 +1100 Subject: [PATCH 09/13] Rework CombinedValidations --- pandas_schema/core.py | 66 +++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 6c4bf99..1d21ed6 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -1,6 +1,7 @@ import abc import math import datetime +from itertools import chain import pandas as pd import numpy as np import typing @@ -136,15 +137,36 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + # def generate_warnings(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + # """ + # Given a series that has been sliced down to only those that definitely failed, produce a list of + # ValidationWarnings. + # Note, this is different to validate_series, which actually calculates which rows have failed. + # Having this as a separate method allows it to be accessed by the CombinedValidation + # + # :param series: A series that has been sliced down to only those that definitely failed + # """ + # return ( + # ValidationWarning(self, { + # 'row': row_idx, + # 'value': cell + # }) for row_idx, cell in series.items() + # ) + + def warning_series(self, series): failed = ~self.select_cells(series) - cells = series[failed] - return ( - ValidationWarning(self, { - 'row': row_idx, - 'value': cell - }) for row_idx, cell in cells.items() - ) + + # Slice out the failed items, then map each into a list of validation warnings at each respective index + return series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { + 'row': row.name, + 'value': row[0] + })], axis='columns') + + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + warnings = self.warning_series(series) + + # Remove the empty elements, split the list of warnings in each cell, and then compile that 
into a list + return warnings.dropna().explode().tolist() class CombinedValidation(BaseValidation): @@ -156,25 +178,27 @@ def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanS message: str): super().__init__(message=message) self.operator = operator - self.v_a = validation_a - self.v_b = validation_b + self.left = validation_a + self.right = validation_b def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: # Let both validations separately select and filter a column - left_series = self.v_a.select_series(df) - right_series = self.v_a.select_series(df) + left_series = self.left.select_series(df) + right_series = self.right.select_series(df) - left_failed = ~self.v_a.select_cells(left_series) - right_failed = ~self.v_b.select_cells(right_series) + left_errors = self.left.warning_series(left_series) + right_errors = self.right.warning_series(right_series) - # Then, we combine the two resulting boolean series, and determine the row indices of the result - failed = self.operator(left_failed, right_failed) + # TODO - return ( - ValidationWarning(self, { - 'row': row_idx, - }) for row_idx in np.where(failed) - ) + # Then, we combine the two resulting boolean series, and determine the row indices of the result + # failed = self.operator(left_errors, right_errors) + # + # # If they did fail, obtain warnings from the validation that caused it + # return chain( + # self.v_a.generate_warnings(left_series[left_failed & failed]), + # self.v_b.generate_warnings(right_series[right_failed & failed]), + # ) @property def default_message(self): From 9bd2704d35ba3d9ecc5f533dbd180ed5f69cb562 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 25 Feb 2020 00:36:42 +1100 Subject: [PATCH 10/13] More work --- UPDATE.md | 5 +++ pandas_schema/core.py | 88 ++++++++++++++++++++++--------------------- test/test_column.py | 68 --------------------------------- 3 files changed, 51 insertions(+), 110 deletions(-) delete mode 100755 
test/test_column.py diff --git a/UPDATE.md b/UPDATE.md index 80c3562..5bbc15f 100755 --- a/UPDATE.md +++ b/UPDATE.md @@ -26,3 +26,8 @@ that spawned it * Each category of Validation will define a `create_prefix()` method, that creates the {row: 1, column: 2} prefix that goes before each message. Thus, `generate_message()` will concatenate that with the actual message * + +## Options for placing CombinedValidation in the inheritance hierarchy +* In order to make both CombinedValidation and BooleanSeriesValidation both share a class, so they can be chained together, +either we had to make a mixin that creates a "side path" that doesn't call `validate` (in this case, `validate_with_series`), +or we diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 1d21ed6..982a480 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -119,7 +119,24 @@ def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarnin pass -class BooleanSeriesValidation(IndexSeriesValidation): +class WarningSeriesGenerator(BaseValidation, abc.ABC): + """ + Mixin class that indicates that this Validation can produce a "warning series", which is a pandas Series with one + or more warnings in each cell, corresponding to warnings detected in the DataFrame at the same index + """ + + @abc.abstractmethod + def get_warning_series(self, df: pd.DataFrame) -> pd.Series: + """ + Return a series of ValidationWarnings, not an iterable of ValidationWarnings like the normal validate() method + """ + + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + warnings = self.get_warning_series(df) + return warnings.dropna().explode().tolist() + + +class BooleanSeriesValidation(IndexSeriesValidation, WarningSeriesGenerator): """ Validation is defined by the function :py:meth:~select_cells that returns a boolean series. Each cell that has False has failed the validation. 
@@ -137,68 +154,55 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - # def generate_warnings(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - # """ - # Given a series that has been sliced down to only those that definitely failed, produce a list of - # ValidationWarnings. - # Note, this is different to validate_series, which actually calculates which rows have failed. - # Having this as a separate method allows it to be accessed by the CombinedValidation - # - # :param series: A series that has been sliced down to only those that definitely failed - # """ - # return ( - # ValidationWarning(self, { - # 'row': row_idx, - # 'value': cell - # }) for row_idx, cell in series.items() - # ) - - def warning_series(self, series): + def get_warning_series(self, series) -> pd.Series: + """ + Validates a series and returns a series of warnings. + This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series + :param series: The series to validate + """ failed = ~self.select_cells(series) # Slice out the failed items, then map each into a list of validation warnings at each respective index return series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { - 'row': row.name, - 'value': row[0] - })], axis='columns') - - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - warnings = self.warning_series(series) - - # Remove the empty elements, split the list of warnings in each cell, and then compile that into a list - return warnings.dropna().explode().tolist() + 'row': row.name, + 'value': row[0] + })], axis='columns') -class CombinedValidation(BaseValidation): +class CombinedValidation(WarningSeriesGenerator): """ Validates if one and/or the other validation is true for an element """ - def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator, + def __init__(self, validation_a: 
BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator: str, message: str): super().__init__(message=message) self.operator = operator self.left = validation_a self.right = validation_b - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def get_warning_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column left_series = self.left.select_series(df) right_series = self.right.select_series(df) - left_errors = self.left.warning_series(left_series) - right_errors = self.right.warning_series(right_series) - - # TODO + left_errors = self.left.get_warning_series(left_series) + right_errors = self.right.get_warning_series(right_series) + + if self.operator == 'and': + # If it's an "and" validation, left, right, or both failing means an error, so we can simply concatenate + # the lists of errors + combined = left_errors.combine(right_errors, func=operator.add) + elif self.operator == 'or': + # [error] and [] = [] + # [error_1] and [error_2] = [error_2] + # [] and [] = [] + # Thus, we can use the and operator to implement "or" validations + combined = left_errors.combine(right_errors, func=operator.and_)#func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) + else: + raise Exception('Operator must be "and" or "or"') - # Then, we combine the two resulting boolean series, and determine the row indices of the result - # failed = self.operator(left_errors, right_errors) - # - # # If they did fail, obtain warnings from the validation that caused it - # return chain( - # self.v_a.generate_warnings(left_series[left_failed & failed]), - # self.v_b.generate_warnings(right_series[right_failed & failed]), - # ) + return combined @property def default_message(self): diff --git a/test/test_column.py b/test/test_column.py deleted file mode 100755 index 38e61f0..0000000 --- a/test/test_column.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest -import pandas as pd - -from pandas_schema 
import Column -from pandas_schema.validation import CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation - - -class SingleValidationColumn(unittest.TestCase): - """ - Test a column with one single validation - """ - NAME = 'col1' - - col = Column(NAME, [CanConvertValidation(int)], allow_empty=False) - ser = pd.Series([ - 'a', - 'b', - 'c' - ]) - - def test_name(self): - self.assertEqual(self.col.name, self.NAME, 'A Column does not store its name correctly') - - def test_outputs(self): - results = self.col.validate(self.ser) - - self.assertEqual(len(results), len(self.ser), 'A Column produces the wrong number of errors') - for i in range(2): - self.assertTrue(any([r.row == i for r in results]), 'A Column does not report errors for every row') - - -class DoubleValidationColumn(unittest.TestCase): - """ - Test a column with two different validations - """ - NAME = 'col1' - - col = Column(NAME, [TrailingWhitespaceValidation(), LeadingWhitespaceValidation()], allow_empty=False) - ser = pd.Series([ - ' a ', - ' b ', - ' c ' - ]) - - def test_outputs(self): - results = self.col.validate(self.ser) - - # There should be 6 errors, 2 for each row - self.assertEqual(len(results), 2 * len(self.ser), 'A Column produces the wrong number of errors') - for i in range(2): - in_row = [r for r in results if r.row == i] - self.assertEqual(len(in_row), 2, 'A Column does not report both errors for every row') - - -class AllowEmptyColumn(unittest.TestCase): - """ - Test a column with one single validation that allows empty columns - """ - NAME = 'col1' - - col = Column(NAME, [CanConvertValidation(int)], allow_empty=True) - ser = pd.Series([ - '', - ]) - - def test_outputs(self): - results = self.col.validate(self.ser) - self.assertEqual(len(results), 0, 'allow_empty is not allowing empty columns') From f502167932b39597676b5b828a36f3ebffec39d3 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 17 Mar 2020 00:05:31 +1100 Subject: [PATCH 11/13] Fix more tests 
--- pandas_schema/core.py | 176 ++++++----- pandas_schema/index.py | 41 ++- pandas_schema/validations.py | 38 ++- test/test_validation.py | 558 ++++++++++++++--------------------- 4 files changed, 387 insertions(+), 426 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 982a480..a7963a6 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -12,7 +12,7 @@ from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import PandasIndexer +from pandas_schema.index import PandasIndexer, IndexValue from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -29,64 +29,37 @@ def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: :return: All validation failures detected by this validation """ + @abc.abstractmethod def message(self, warning: ValidationWarning) -> str: pass -class SeriesValidation(BaseValidation): +class IndexValidation(BaseValidation): """ - A SeriesValidation validates a DataFrame by selecting a single series from it, and - applying some validation to it + Mixin for Validation classes, giving them access to an index for selecting a Series out of the DataFrame """ - @abc.abstractmethod - def select_series(self, df: pd.DataFrame) -> pd.Series: - """ - Selects a series from the DataFrame that will be validated - """ - - @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - """ - Validate a single series - """ - - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: - series = self.select_series(df) - return self.validate_series(series) - - -class IndexSeriesValidation(SeriesValidation): - """ - Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation - or later - """ - - def __init__(self, index: PandasIndexer = None, message: str = None): + def 
__init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str = None, **kwargs): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ - self.index = index + super().__init__(**kwargs) + if isinstance(index, PandasIndexer): + self.index = index + else: + # If it isn't already an indexer object, convert it to one + self.index = PandasIndexer(index=index) self.custom_message = message - def message(self, warning: ValidationWarning): - """ - Gets a message describing how the DataFrame cell failed the validation - This shouldn't really be overridden, instead override default_message so that users can still set per-object - messages - :return: - """ - if self.index.type == 'position': - prefix = self.index.index - else: - prefix = '"{}"'.format(self.index.index) + def message(self, warning: ValidationWarning) -> str: + prefix = self.prefix() if self.custom_message: suffix = self.custom_message else: - suffix = self.default_message(warning) + suffix = self.default_message return "Column {} {}".format(prefix, suffix) @@ -97,12 +70,7 @@ def readable_name(self, **kwargs): """ return type(self).__name__ - def default_message(self, warning: ValidationWarning) -> str: - """ - Create a message to be displayed whenever this validation fails - This should be a generic message for the validation type, but can be overwritten if the user provides a - message kwarg - """ + def default_message(self) -> str: return 'failed the {}'.format(self.readable_name) def select_series(self, df: pd.DataFrame) -> pd.Series: @@ -114,6 +82,53 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return self.index(df) + def prefix(self): + """ + Return a string that could be used to prefix a message that relates to this index + """ + if self.index is None: + return "" + + if self.index.type == 'position': + return self.index.index + else: + return 
'"{}"'.format(self.index.index) + + +# +# class SeriesValidation(BaseValidation): +# """ +# A SeriesValidation validates a DataFrame by selecting a single series from it, and +# applying some validation to it +# """ +# +# @abc.abstractmethod +# def select_series(self, df: pd.DataFrame) -> pd.Series: +# """ +# Selects a series from the DataFrame that will be validated +# """ +# +# @abc.abstractmethod +# def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: +# """ +# Validate a single series +# """ +# +# def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: +# series = self.select_series(df) +# return self.validate_series(series) + + +class SeriesValidation(IndexValidation): + """ + A SeriesValidation validates a DataFrame by selecting a single series from it, and + applying some validation to it + """ + + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + series = self.index(df) + return self.validate_series(series) + @abc.abstractmethod def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: pass @@ -131,12 +146,25 @@ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: Return a series of ValidationWarnings, not an iterable of ValidationWarnings like the normal validate() method """ - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + @staticmethod + def flatten_warning_series(warnings: pd.Series): + """ + Converts a warning series into an iterable of warnings + """ + return warnings[warnings.astype(bool)].explode().tolist() + + def validate(self, df: pd.DataFrame, flatten=True) -> typing.Union[ + typing.Iterable[ValidationWarning], + pd.Series + ]: warnings = self.get_warning_series(df) - return warnings.dropna().explode().tolist() + if flatten: + return self.flatten_warning_series(warnings) + else: + return warnings -class BooleanSeriesValidation(IndexSeriesValidation, WarningSeriesGenerator): +class 
BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): """ Validation is defined by the function :py:meth:~select_cells that returns a boolean series. Each cell that has False has failed the validation. @@ -154,19 +182,35 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - def get_warning_series(self, series) -> pd.Series: + def validate_series(self, series, flatten=True) -> typing.Union[ + typing.Iterable[ValidationWarning], + pd.Series + ]: """ - Validates a series and returns a series of warnings. - This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series - :param series: The series to validate + Utility method for shortcutting data-frame validation and instead validating only a single series """ failed = ~self.select_cells(series) # Slice out the failed items, then map each into a list of validation warnings at each respective index - return series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { + warnings = series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { 'row': row.name, 'value': row[0] - })], axis='columns') + })], axis='columns', result_type='reduce') + # warnings = warnings.iloc[:, 0] + + if flatten: + return self.flatten_warning_series(warnings) + else: + return warnings + + def get_warning_series(self, df: pd.DataFrame) -> pd.Series: + """ + Validates a series and returns a series of warnings. 
+ This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series + :param series: The series to validate + """ + series = self.select_series(df) + return self.validate_series(series, flatten=False) class CombinedValidation(WarningSeriesGenerator): @@ -174,31 +218,31 @@ class CombinedValidation(WarningSeriesGenerator): Validates if one and/or the other validation is true for an element """ - def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator: str, - message: str): - super().__init__(message=message) + def message(self, warning: ValidationWarning) -> str: + pass + + def __init__(self, validation_a: WarningSeriesGenerator, validation_b: WarningSeriesGenerator, operator: str): + super().__init__() self.operator = operator self.left = validation_a self.right = validation_b def get_warning_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column - left_series = self.left.select_series(df) - right_series = self.right.select_series(df) - - left_errors = self.left.get_warning_series(left_series) - right_errors = self.right.get_warning_series(right_series) + left_errors = self.left.validate(df, flatten=False) + right_errors = self.right.validate(df, flatten=False) if self.operator == 'and': # If it's an "and" validation, left, right, or both failing means an error, so we can simply concatenate # the lists of errors - combined = left_errors.combine(right_errors, func=operator.add) + combined = left_errors.combine(right_errors, func=operator.add, fill_value=[]) elif self.operator == 'or': # [error] and [] = [] # [error_1] and [error_2] = [error_2] # [] and [] = [] # Thus, we can use the and operator to implement "or" validations - combined = left_errors.combine(right_errors, func=operator.and_)#func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) + combined = left_errors.combine(right_errors, func=lambda l, r: l + r 
if l and r else [], fill_value=[]) + # func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) else: raise Exception('Operator must be "and" or "or"') diff --git a/pandas_schema/index.py b/pandas_schema/index.py index d37cd91..51f1172 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -3,6 +3,7 @@ from typing import Union import numpy import pandas +from enum import Enum IndexValue = Union[numpy.string_, numpy.int_, str, int] """ @@ -11,23 +12,34 @@ """ +class IndexType(Enum): + POSITION = 0 + LABEL = 1 + + class PandasIndexer: """ An index into a particular axis of a DataFrame. Attempts to recreate the behaviour of `df.ix[some_index]` """ - valid_types = {'position', 'label'} + # valid_types = {'position', 'label'} index: IndexValue """ The index to use, either an integer for position-based indexing, or a string for label-based indexing """ - type: str + type: IndexType """ The type of indexing to use, either 'position' or 'label' """ - def __init__(self, index: IndexValue, typ: str = None): + axis: int + """ + The axis for the indexer + """ + + def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): self.index = index + self.axis = axis if typ is not None: # If the type is provided, validate it @@ -38,31 +50,30 @@ def __init__(self, index: IndexValue, typ: str = None): else: # If the type isn't provided, guess it based on the datatype of the index if numpy.issubdtype(type(index), numpy.character): - self.type = 'label' + self.type = IndexType.LABEL elif numpy.issubdtype(type(index), numpy.int_): - self.type = 'position' + self.type = IndexType.POSITION else: raise PanSchIndexError('The index value was not either an integer or string, or an array of either of ' 'these') - - def __call__(self, df: pandas.DataFrame, axis: int = 0): + def __call__(self, df: pandas.DataFrame): """ Apply this index :param df: The DataFrame to index :param axis: The axis to index along. 
axis=0 will select a row, and axis=1 will select a column """ - if self.type == 'label': - return df.loc(axis=axis)[self.index] - elif self.type == 'label': - return df.iloc(axis=axis)[self.index] + if self.type == IndexType.LABEL: + return df.loc(axis=self.axis)[self.index] + elif self.type == IndexType.POSITION: + return df.iloc(axis=self.axis)[self.index] class RowIndexer(PandasIndexer): - def __call__(self, df: pandas.DataFrame): - return super().__call__(df, axis=0) + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=0) class ColumnIndexer(PandasIndexer): - def __call__(self, df: pandas.DataFrame): - return super().__call__(df, axis=1) + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=1) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index d05e180..b2ffae1 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -7,7 +7,7 @@ import operator from . import column -from .core import IndexSeriesValidation, BooleanSeriesValidation +from .core import SeriesValidation, BooleanSeriesValidation, IndexValidation from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -19,7 +19,7 @@ class CustomSeriesValidation(BooleanSeriesValidation): Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) """ - def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): + def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], *args, **kwargs): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -29,8 +29,9 @@ def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal to True if the object passed validation, and False if it failed """ + super().__init__(*args, **kwargs) self._validation = validation - super().__init__(message=message) + def select_cells(self, series: pd.Series) -> pd.Series: return self._validation(series) @@ -41,8 +42,7 @@ class CustomElementValidation(BooleanSeriesValidation): Validates using a user-provided function that operates on each element """ - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], - message: str): + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], *args, **kwargs): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -53,7 +53,7 @@ def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], the validation, and false if it doesn't """ self._validation = validation - super().__init__(message=message) + super().__init__(*args, **kwargs) def select_cells(self, series: pd.Series) -> pd.Series: return series.apply(self._validation) @@ -82,7 +82,7 @@ def select_cells(self, series: pd.Series) -> pd.Series: return (series >= self.min) & (series < self.max) -class IsDtypeValidation(IndexSeriesValidation): +class IsDtypeValidation(SeriesValidation): """ Checks that a series has a certain numpy dtype """ @@ -126,7 +126,8 @@ def __init__(self, func: typing.Callable, **kwargs): type)) super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'raised an exception when the callable {} was called on it'.format( self.callable) @@ -161,7 +162,8 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'cannot be converted to type {}'.format(self.callable) @@ -180,7 +182,8 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'does not match the pattern "{}"'.format(self.pattern) def select_cells(self, series: pd.Series) -> pd.Series: @@ -195,7 +198,8 @@ class TrailingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'contains trailing whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -210,7 
+214,8 @@ class LeadingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'contains leading whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -225,7 +230,8 @@ class IsDistinctValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'contains values that are not unique' def select_cells(self, series: pd.Series) -> pd.Series: @@ -246,7 +252,8 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) @@ -271,7 +278,8 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): diff --git a/test/test_validation.py b/test/test_validation.py index c02615e..a0c316c 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,41 +10,43 @@ import pandas as pd from pandas_schema.validations import * -from pandas_schema.core import BooleanSeriesValidation +from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, BaseValidation from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning -class ValidationTestBase(unittest.TestCase): - def seriesEquality(self, s1: pd.Series, 
s2: pd.Series, msg: str = None): - if not s1.equals(s2): - raise self.failureException(msg) - - def validate_and_compare(self, series: list, expected_result: bool, msg: str = None): - """ - Checks that every element in the provided series is equal to `expected_result` after validation - :param series_dtype: Explicitly specifies the dtype for the generated Series - :param series: The series to check - :param expected_result: Whether the elements in this series should pass the validation - :param msg: The message to display if this test fails - """ +def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ValidationWarning]: + """ + Tests a validator by asserting that it generates the amount of warnings + :param series_dtype: Explicitly specifies the dtype for the generated Series + :param series: The series to check + :param expected_result: Whether the elements in this series should pass the validation + :param msg: The message to display if this test fails + """ - # Check that self.validator is correct - if not self.validator or not isinstance(self.validator, BooleanSeriesValidation): - raise ValueError('The class must have the validator field set to an instance of a Validation subclass') + # # Check that self.validator is correct + # if not self.validator or not isinstance(self.validator, BooleanSeriesValidation, index=0): + # raise ValueError('The class must have the validator field set to an instance of a Validation subclass') + # + # # Ensure we're comparing series correctly + # self.addTypeEqualityFunc(pd.Series, self.seriesEquality) - # Ensure we're comparing series correctly - self.addTypeEqualityFunc(pd.Series, self.seriesEquality) + df = pd.Series(series).to_frame() + warnings = validator.validate(df) + return list(warnings) + # + # # Now find any items where their validation does not correspond to the expected_result + # for item, result in zip(series, results): + # with self.subTest(value=item): + # self.assertEqual(result, 
expected_result, msg) - # Convert the input list to a series and validate it - results = self.validator.select_cells(pd.Series(series)) - # Now find any items where their validation does not correspond to the expected_result - for item, result in zip(series, results): - with self.subTest(value=item): - self.assertEqual(result, expected_result, msg) +class ValidationTestBase(unittest.TestCase): + def seriesEquality(self, s1: pd.Series, s2: pd.Series, msg: str = None): + if not s1.equals(s2): + raise self.failureException(msg) class CustomSeries(ValidationTestBase): @@ -53,13 +55,17 @@ class CustomSeries(ValidationTestBase): """ def setUp(self): - self.validator = CustomSeriesValidation(lambda s: ~s.str.contains('fail'), 'contained the word fail') + self.validator = CustomSeriesValidation( + lambda s: ~s.str.contains('fail'), + message='contained the word fail', + index=0 + ) def test_valid_inputs(self): - self.validate_and_compare(['good', 'success'], True, 'did not accept valid inputs') + assert len(get_warnings(self.validator, ['good', 'success'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - self.validate_and_compare(['fail', 'failure'], False, 'accepted invalid inputs') + assert len(get_warnings(self.validator, ['fail', 'failure'])) == 2, 'accepted invalid inputs' class CustomElement(ValidationTestBase): @@ -68,13 +74,18 @@ class CustomElement(ValidationTestBase): """ def setUp(self): - self.validator = CustomElementValidation(lambda s: s.startswith('_start_'), "Didn't begin with '_start_'") + self.validator = CustomElementValidation( + lambda s: s.startswith('_start_'), + message="Didn't begin with '_start_'", + index=0 + ) def test_valid_inputs(self): - self.validate_and_compare(['_start_sdiyhsd', '_start_234fpwunxc\n'], True, 'did not accept valid inputs') + assert len( + get_warnings(self.validator, ['_start_sdiyhsd', '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - 
self.validate_and_compare(['fail', '324wfp9ni'], False, 'accepted invalid inputs') + assert len(get_warnings(self.validator, ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' class LeadingWhitespace(ValidationTestBase): @@ -83,43 +94,31 @@ class LeadingWhitespace(ValidationTestBase): """ def setUp(self): - self.validator = LeadingWhitespaceValidation() + self.validator = LeadingWhitespaceValidation(index=0) def test_validate_trailing_whitespace(self): - self.validate_and_compare( - [ - 'trailing space ', - 'trailing tabs ', - '''trailing newline - ''' - ], - True, - 'is incorrectly failing on trailing whitespace' - ) + assert len(get_warnings(self.validator, [ + 'trailing space ', + 'trailing tabs ', + '''trailing newline + ''' + ])) == 0, 'is incorrectly failing on trailing whitespace' def test_validate_leading_whitespace(self): - self.validate_and_compare( - [ - ' leading spaces', - ' leading tabs', - ''' - leading newline''', - ], - False, - 'does not detect leading whitespace' - ) + assert len(get_warnings(self.validator, [ + ' leading spaces', + ' leading tabs', + ''' + leading newline''', + ])) == 3, 'does not detect leading whitespace' def test_validate_middle_whitespace(self): - self.validate_and_compare( - [ - 'middle spaces', - 'middle tabs', - '''middle - newline''', - ], - True, - 'is incorrectly failing on central whitespace' - ) + assert len(get_warnings(self.validator, [ + 'middle spaces', + 'middle tabs', + '''middle + newline''', + ])) == 0, 'is incorrectly failing on central whitespace' class TrailingWhitespace(ValidationTestBase): @@ -128,44 +127,32 @@ class TrailingWhitespace(ValidationTestBase): """ def setUp(self): - self.validator = TrailingWhitespaceValidation() + self.validator = TrailingWhitespaceValidation(index=0) super().setUp() def test_validate_trailing_whitespace(self): - self.validate_and_compare( - [ - 'trailing space ', - 'trailing tabs ', - '''trailing newline - ''' - ], - False, - 'is not detecting trailing whitespace' - ) 
+ assert len(get_warnings(self.validator, [ + 'trailing space ', + 'trailing tabs ', + '''trailing newline + ''' + ])) == 3, 'is not detecting trailing whitespace' def test_validate_leading_whitespace(self): - self.validate_and_compare( - [ - ' leading spaces', - ' leading tabs', - ''' - leading newline''', - ], - True, - 'is incorrectly failing on leading whitespace' - ) + assert len(get_warnings(self.validator, [ + ' leading spaces', + ' leading tabs', + ''' + leading newline''', + ])) == 0, 'is incorrectly failing on leading whitespace' def test_validate_middle_whitespace(self): - self.validate_and_compare( - [ - 'middle spaces', - 'middle tabs', - '''middle - newline''', - ], - True, - 'is incorrectly failing on central whitespace' - ) + assert len(get_warnings(self.validator, [ + 'middle spaces', + 'middle tabs', + '''middle + newline''', + ])) == 0, 'is incorrectly failing on central whitespace' class CanCallJson(ValidationTestBase): @@ -174,29 +161,21 @@ class CanCallJson(ValidationTestBase): """ def setUp(self): - self.validator = CanCallValidation(json.loads) + self.validator = CanCallValidation(json.loads, index=0) def test_validate_valid_json(self): - self.validate_and_compare( - [ - '[1, 2, 3]', - '{"a": 1.1, "b": 2.2, "c": 3.3}', - '"string"' - ], - True, - 'is incorrectly failing on valid JSON' - ) + assert len(get_warnings(self.validator, [ + '[1, 2, 3]', + '{"a": 1.1, "b": 2.2, "c": 3.3}', + '"string"' + ])) == 0, 'is incorrectly failing on valid JSON' def test_validate_invalid_json(self): - self.validate_and_compare( - [ - '[1, 2, 3', - '{a: 1.1, b: 2.2, c: 3.3}', - 'string' - ], - False, - 'is not detecting invalid JSON' - ) + assert len(get_warnings(self.validator, [ + '[1, 2, 3', + '{a: 1.1, b: 2.2, c: 3.3}', + 'string' + ])) == 3, 'is not detecting invalid JSON' class CanCallLambda(ValidationTestBase): @@ -206,29 +185,21 @@ class CanCallLambda(ValidationTestBase): def setUp(self): # Succeed if it's divisible by 2, otherwise cause an error - 
self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0) + self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, index=0) def test_validate_noerror(self): - self.validate_and_compare( - [ - 2, - 4, - 6 - ], - True, - 'is incorrectly failing on even numbers' - ) + assert len(get_warnings(self.validator, [ + 2, + 4, + 6 + ])) == 0, 'is incorrectly failing on even numbers' def test_validate_error(self): - self.validate_and_compare( - [ - 1, - 3, - 5 - ], - False, - 'should fail on odd numbers' - ) + assert len(get_warnings(self.validator, [ + 1, + 3, + 5 + ])) == 3, 'should fail on odd numbers' class CanConvertInt(ValidationTestBase): @@ -237,161 +208,117 @@ class CanConvertInt(ValidationTestBase): """ def setUp(self): - self.validator = CanConvertValidation(int) + self.validator = CanConvertValidation(int, index=0) def test_valid_int(self): - self.validate_and_compare( - [ - '1', - '10', - '999', - '99999' - ], - True, - 'does not accept valid integers' - ) + assert len(get_warnings(self.validator, [ + '1', + '10', + '999', + '99999' + ])) == 0, 'does not accept valid integers' def test_invalid_int(self): - self.validate_and_compare( - [ - '1.0', - '9.5', - 'abc', - '1e-6' - ], - False, - 'accepts invalid integers' - ) + assert len(get_warnings(self.validator, [ + '1.0', + '9.5', + 'abc', + '1e-6' + ])) == 4, 'accepts invalid integers' class InListCaseSensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c']) + self.validator = InListValidation(['a', 'b', 'c'], index=0) def test_valid_elements(self): - self.validate_and_compare( - [ - 'a', - 'b', - 'c' - ], - True, - 'does not accept elements that are in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'a', + 'b', + 'c' + ])) == 0, 'does not accept elements that are in the validation list' def test_invalid_elements(self): - self.validate_and_compare( - [ - 'aa', - 'bb', - 'd', - 'A', - 'B', - 'C' - ], - False, - 
'accepts elements that are not in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'aa', + 'bb', + 'd', + 'A', + 'B', + 'C' + ])) == 6, 'accepts elements that are not in the validation list' class InListCaseInsensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) def test_valid_elements(self): - self.validate_and_compare( - [ - 'a', - 'b', - 'c', - 'A', - 'B', - 'C' - ], - True, - 'does not accept elements that are in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'a', + 'b', + 'c', + 'A', + 'B', + 'C' + ])) == 0, 'does not accept elements that are in the validation list' def test_invalid_elements(self): - self.validate_and_compare( - [ - 'aa', - 'bb', - 'd', - ], - False, - 'accepts elements that are not in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'aa', + 'bb', + 'd', + ])) == 3, 'accepts elements that are not in the validation list' class DateFormat(ValidationTestBase): def setUp(self): - self.validator = DateFormatValidation('%Y%m%d') + self.validator = DateFormatValidation('%Y%m%d', index=0) def test_valid_dates(self): - self.validate_and_compare( - [ - '20160404', - '00011212' - ], - True, - 'does not accept valid dates' - ) + assert len(get_warnings(self.validator, [ + '20160404', + '00011212' + ])) == 0, 'does not accept valid dates' def test_invalid_dates(self): - self.validate_and_compare( - [ - '1/2/3456', - 'yyyymmdd', - '11112233' - ], - False, - 'accepts invalid dates' - ) + assert len(get_warnings(self.validator, [ + '1/2/3456', + 'yyyymmdd', + '11112233' + ])) == 3, 'accepts invalid dates' class StringRegexMatch(ValidationTestBase): def setUp(self): - self.validator = MatchesPatternValidation('^.+\.txt$') + self.validator = MatchesPatternValidation('^.+\.txt$', index=0) def test_valid_strings(self): - 
self.validate_and_compare( - [ - 'pass.txt', - 'a.txt', - 'lots of words.txt' - ], - True, - 'does not accept strings matching the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txt', + 'a.txt', + 'lots of words.txt' + ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - self.validate_and_compare( - [ - 'pass.TXT', - '.txt', - 'lots of words.tx' - ], - False, - 'accepts strings that do not match the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.TXT', + '.txt', + 'lots of words.tx' + ])) == 3, 'accepts strings that do not match the regex' class IsDistinct(ValidationTestBase): def setUp(self): - self.validator = IsDistinctValidation() + self.validator = IsDistinctValidation(index=0) def test_valid_strings(self): - self.validate_and_compare( - [ - '1', - '2', - '3', - '4' - ], - True, - 'does not accept unique strings' - ) + assert len(get_warnings(self.validator, [ + '1', + '2', + '3', + '4' + ])) == 0, 'does not accept unique strings' def test_invalid_strings(self): validation = self.validator.select_cells(pd.Series([ @@ -415,29 +342,21 @@ class CompiledRegexMatch(ValidationTestBase): """ def setUp(self): - self.validator = MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE)) + self.validator = MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE), index=0) def test_valid_strings(self): - self.validate_and_compare( - [ - 'pass.txt', - 'a.TXT', - 'lots of words.tXt' - ], - True, - 'does not accept strings matching the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txt', + 'a.TXT', + 'lots of words.tXt' + ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - self.validate_and_compare( - [ - 'pass.txtt', - '.txt', - 'lots of words.tx' - ], - False, - 'accepts strings that do not match the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txtt', + '.txt', + 'lots of words.tx' + ])) == 3, 'accepts strings that do not 
match the regex' class InRange(ValidationTestBase): @@ -446,29 +365,21 @@ class InRange(ValidationTestBase): """ def setUp(self): - self.validator = InRangeValidation(7, 9) + self.validator = InRangeValidation(7, 9, index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 7, - 8, - 7 - ], - True, - 'does not accept integers in the correct range' - ) + assert len(get_warnings(self.validator, [ + 7, + 8, + 7 + ])) == 0, 'does not accept integers in the correct range' def test_invalid_items(self): - self.validate_and_compare( - [ - 1, - 2, - 3 - ], - False, - 'Incorrectly accepts integers outside of the range' - ) + assert len(get_warnings(self.validator, [ + 1, + 2, + 3 + ])) == 3, 'Incorrectly accepts integers outside of the range' class Dtype(ValidationTestBase): @@ -477,7 +388,7 @@ class Dtype(ValidationTestBase): """ def setUp(self): - self.validator = IsDtypeValidation(np.number) + self.validator = IsDtypeValidation(np.number, index=0) def test_valid_items(self): errors = self.validator.validate_series(pd.Series( @@ -500,7 +411,6 @@ def test_invalid_items(self): self.assertEqual(len(errors), 1) self.assertEqual(type(errors[0]), ValidationWarning) - def test_schema(self): """ Test this validation inside a schema, to ensure we get helpful error messages. 
@@ -530,36 +440,27 @@ def test_schema(self): ) - class Negate(ValidationTestBase): """ Tests the ~ operator on a MatchesPatternValidation """ def setUp(self): - self.validator = ~MatchesPatternValidation('fail') + self.validator = ~MatchesPatternValidation('fail', index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 'Pass', - '1', - 'True' - ], - True, - 'Rejects values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'Pass', + '1', + 'True' + ])) == 0, 'Rejects values that should pass' def test_invalid_items(self): - self.validate_and_compare( - [ - 'fail', - 'thisfails', - 'failure' - ], - False, - 'Accepts values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'fail', + 'thisfails', + 'failure' + ])) == 3, 'Accepts values that should pass' class Or(ValidationTestBase): @@ -568,30 +469,27 @@ class Or(ValidationTestBase): """ def setUp(self): - self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass') + self.validator = CombinedValidation( + MatchesPatternValidation('yes', index=0), + MatchesPatternValidation('pass', index=0), + 'or' + ) + # self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass', index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 'pass', - 'yes', - 'passyes', - '345yes345' - ], - True, - 'Rejects values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'pass', + 'yes', + 'passyes', + '345yes345' + ])) == 0, 'rejects values that should pass' def test_invalid_items(self): - self.validate_and_compare( - [ - 'fail', - 'YES', - 'YPESS' - ], - False, - 'Accepts values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'fail', + 'YES', + 'YPESS' + ])) == 6, 'accepts values that should pass' class CustomMessage(ValidationTestBase): @@ -610,7 +508,7 @@ def test_default_message(self): 2, 3 ] - )): + ), flatten=True): self.assertNotRegex(error.message, self.message, 'Validator not using the 
default warning message!') def test_custom_message(self): @@ -621,7 +519,7 @@ def test_custom_message(self): 2, 3 ] - )): + ), flatten=True): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') @@ -634,17 +532,17 @@ def setUp(self): self.vals = [1.0, None, 3] def test_in_range_allow_empty_with_error(self): - validator = InRangeValidation(min=4) + validator = InRangeValidation(min=4, index=0) errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): - validator = InRangeValidation(min=0) + validator = InRangeValidation(min=0, index=0) errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): - validator = InRangeValidation(min=4) + validator = InRangeValidation(min=4, index=0) errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), len(self.vals)) @@ -655,7 +553,7 @@ class PandasDtypeTests(ValidationTestBase): """ def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) def test_valid_elements(self): errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) From bc7f269a57c1a15e7ca563a121fb72dbca24b953 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 20 Mar 2020 02:19:34 +1100 Subject: [PATCH 12/13] All tests passing; fixed message generation, fixed negation --- TODO.md | 5 ++- pandas_schema/core.py | 62 +++++++++++++++++++++-------- pandas_schema/validation_warning.py | 10 ++++- pandas_schema/validations.py | 29 +++++--------- test/test_validation.py | 32 ++++++++++----- 5 files changed, 89 insertions(+), 49 deletions(-) diff --git a/TODO.md b/TODO.md index 0a6c2c6..f777a57 100755 --- a/TODO.md +++ 
b/TODO.md @@ -2,4 +2,7 @@ * [x] Fix CombinedValidations * [x] Add replacement for allow_empty Columns * [ ] New column() tests -* [ ] New CombinedValidation tests \ No newline at end of file +* [ ] New CombinedValidation tests +* [x] Fix Negate +* [ ] Add facility for allow_empty +* [x] Fix messages diff --git a/pandas_schema/core.py b/pandas_schema/core.py index a7963a6..28561dc 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -12,7 +12,7 @@ from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import PandasIndexer, IndexValue +from pandas_schema.index import PandasIndexer, IndexValue, IndexType from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -34,9 +34,10 @@ def message(self, warning: ValidationWarning) -> str: pass -class IndexValidation(BaseValidation): +class IndexValidation(BaseValidation, metaclass=abc.ABCMeta): """ - Mixin for Validation classes, giving them access to an index for selecting a Series out of the DataFrame + Abstract class that builds on BaseValidation to give it access to an index for selecting a Series out of the + DataFrame """ def __init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str = None, **kwargs): @@ -54,14 +55,14 @@ def __init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str self.custom_message = message def message(self, warning: ValidationWarning) -> str: - prefix = self.prefix() + prefix = self.prefix(warning) if self.custom_message: suffix = self.custom_message else: - suffix = self.default_message + suffix = self.default_message(warning) - return "Column {} {}".format(prefix, suffix) + return "{} {}".format(prefix, suffix) @property def readable_name(self, **kwargs): @@ -70,7 +71,7 @@ def readable_name(self, **kwargs): """ return type(self).__name__ - def default_message(self) -> str: + def default_message(self, warnings: 
ValidationWarning) -> str: return 'failed the {}'.format(self.readable_name) def select_series(self, df: pd.DataFrame) -> pd.Series: @@ -82,17 +83,19 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return self.index(df) - def prefix(self): + def prefix(self, warning: ValidationWarning): """ - Return a string that could be used to prefix a message that relates to this index + Return a string that can be used to prefix a message that relates to this index + + This method is safe to override """ if self.index is None: return "" - if self.index.type == 'position': - return self.index.index + if self.index.type == IndexType.POSITION: + return 'Column {}'.format(self.index.index) else: - return '"{}"'.format(self.index.index) + return 'Column "{}"'.format(self.index.index) # @@ -173,6 +176,10 @@ class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): because the data is in the same form for each cell. You need only define a :py:meth~default_message. """ + def __init__(self, *args, negated=False, **kwargs): + super().__init__(*args, **kwargs) + self.negated = negated + @abc.abstractmethod def select_cells(self, series: pd.Series) -> pd.Series: """ @@ -187,9 +194,17 @@ def validate_series(self, series, flatten=True) -> typing.Union[ pd.Series ]: """ - Utility method for shortcutting data-frame validation and instead validating only a single series + Validates a single series selected from the DataFrame """ - failed = ~self.select_cells(series) + selection = self.select_cells(series) + + if self.negated: + # If self.negated (which is not the default), then we don't need to flip the booleans + failed = selection + else: + # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass + # the validation, and we want cells that failed it + failed = ~selection # Slice out the failed items, then map each into a list of validation warnings at each respective index warnings = 
series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { @@ -198,6 +213,7 @@ def validate_series(self, series, flatten=True) -> typing.Union[ })], axis='columns', result_type='reduce') # warnings = warnings.iloc[:, 0] + # If flatten, return a list of ValidationWarning, otherwise return a series of lists of Validation Warnings if flatten: return self.flatten_warning_series(warnings) else: @@ -206,12 +222,24 @@ def validate_series(self, series, flatten=True) -> typing.Union[ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: """ Validates a series and returns a series of warnings. - This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series - :param series: The series to validate """ series = self.select_series(df) return self.validate_series(series, flatten=False) + def prefix(self, warning: ValidationWarning): + parent = super().prefix(warning) + # Only in this subclass do we know the contents of the warning props, since we defined them in the + # validate_series method. 
Thus, we can now add row index information + + return parent + ', Row {row}: "{value}"'.format(**warning.props) + + def __invert__(self) -> 'BooleanSeriesValidation': + """ + If a BooleanSeriesValidation is negated, it has the opposite result + """ + self.negated = not self.negated + return self + class CombinedValidation(WarningSeriesGenerator): """ @@ -249,5 +277,5 @@ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: return combined @property - def default_message(self): + def default_message(self, warnings: ValidationWarning) -> str: return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index 3eec3db..e6e3ddd 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -1,4 +1,3 @@ -import pandas_schema from dataclasses import dataclass, field @@ -8,7 +7,7 @@ class ValidationWarning: Represents a difference between the schema and data frame, found during the validation of the data frame """ - validation: 'pandas_schema.BaseValidation' + validation: 'pandas_schema.core.BaseValidation' """ The validation that spawned this warning """ @@ -22,4 +21,11 @@ class ValidationWarning: @property def message(self): + """ + Return this validation as a string + """ + # Internally, this actually asks the validator class to formulate a message return self.validation.message(self) + + def __str__(self): + return self.message diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index b2ffae1..2e803df 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -73,8 +73,7 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'was not in the range [{}, {})'.format(self.min, self.max) def select_cells(self, series: 
pd.Series) -> pd.Series: @@ -126,8 +125,7 @@ def __init__(self, func: typing.Callable, **kwargs): type)) super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'raised an exception when the callable {} was called on it'.format( self.callable) @@ -162,8 +160,7 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'cannot be converted to type {}'.format(self.callable) @@ -182,9 +179,8 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - @property - def default_message(self): - return 'does not match the pattern "{}"'.format(self.pattern) + def default_message(self, warning: ValidationWarning): + return 'does not match the pattern "{}"'.format(self.pattern.pattern) def select_cells(self, series: pd.Series) -> pd.Series: return series.astype(str).str.contains(self.pattern, **self.options) @@ -198,8 +194,7 @@ class TrailingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains trailing whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -214,8 +209,7 @@ class LeadingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains leading whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -230,8 +224,7 @@ class IsDistinctValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains 
values that are not unique' def select_cells(self, series: pd.Series) -> pd.Series: @@ -252,8 +245,7 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) @@ -278,8 +270,7 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): diff --git a/test/test_validation.py b/test/test_validation.py index a0c316c..90e6b0c 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -291,7 +291,7 @@ def test_invalid_dates(self): class StringRegexMatch(ValidationTestBase): def setUp(self): - self.validator = MatchesPatternValidation('^.+\.txt$', index=0) + self.validator = MatchesPatternValidation(r'^.+\.txt$', index=0) def test_valid_strings(self): assert len(get_warnings(self.validator, [ @@ -352,11 +352,22 @@ def test_valid_strings(self): ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - assert len(get_warnings(self.validator, [ + test_data = [ 'pass.txtt', '.txt', 'lots of words.tx' - ])) == 3, 'accepts strings that do not match the regex' + ] + warnings = get_warnings(self.validator, test_data) + + # Check that every piece of data failed + assert len(warnings) == 3, 'accepts strings that do not match the regex' + + # Also test the messages + for i, (warning, data) in enumerate(zip(warnings, test_data)): + assert 'Row {}'.format(i) in warning.message + assert 'Column 0' in warning.message + assert data in warning.message + assert self.validator.pattern.pattern in warning.message class 
InRange(ValidationTestBase): @@ -501,7 +512,7 @@ def setUp(self): self.message = "UNUSUAL MESSAGE THAT WOULDN'T BE IN A NORMAL ERROR" def test_default_message(self): - validator = InRangeValidation(min=4) + validator = InRangeValidation(min=4, index=0) for error in validator.validate_series(pd.Series( [ 1, @@ -512,7 +523,7 @@ def test_default_message(self): self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') def test_custom_message(self): - validator = InRangeValidation(min=4, message=self.message) + validator = InRangeValidation(min=4, message=self.message, index=0) for error in validator.validate_series(pd.Series( [ 1, @@ -523,6 +534,7 @@ def test_custom_message(self): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') +@unittest.skip('allow_empty no longer exists') class GetErrorTests(ValidationTestBase): """ Tests for float valued columns where allow_empty=True @@ -556,17 +568,17 @@ def setUp(self): self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) def test_valid_elements(self): - errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) - self.assertEqual(len(list(errors)), 0) + errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) + assert len(list(errors)) == 0 def test_invalid_empty_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) - self.assertEqual(len(list(errors)), 4) + assert len(list(errors)) == 4 def test_invalid_and_empty_elements(self): errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) - self.assertEqual(len(list(errors)), 1) + assert len(list(errors)) == 1 def test_invalid_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) - self.assertEqual(len(list(errors)), 3) + assert 
len(list(errors)) == 3 From f8ce653cff196e2b58013563010a17881bcd80f2 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sat, 21 Mar 2020 16:09:48 +1100 Subject: [PATCH 13/13] Fix or operator --- TODO.md | 2 ++ UPDATE.md | 14 ++++++++++ pandas_schema/core.py | 8 ++++++ setup.py | 2 +- test/test_validation.py | 59 ++++++++++++++++++++++++++--------------- 5 files changed, 62 insertions(+), 23 deletions(-) diff --git a/TODO.md b/TODO.md index f777a57..3cdd8fb 100755 --- a/TODO.md +++ b/TODO.md @@ -6,3 +6,5 @@ * [x] Fix Negate * [ ] Add facility for allow_empty * [x] Fix messages +* [x] Re-implement the or/and using operators +* [ ] Allow and/or operators between Series-level and row-level validations diff --git a/UPDATE.md b/UPDATE.md index 5bbc15f..c6c8a1d 100755 --- a/UPDATE.md +++ b/UPDATE.md @@ -31,3 +31,17 @@ that spawned it * In order to make both CombinedValidation and BooleanSeriesValidation both share a class, so they can be chained together, either we had to make a mixin that creates a "side path" that doesn't call `validate` (in this case, `validate_with_series`), or we + +# Rework of Validation Indexing +## All Indexed +* All Validations now have an index and an axis +* However, this index can be none, can be column only, row only, or both +* When combined with each other, the resulting boolean series will be broadcast using numpy broadcasting rules +* e.g. + * A per-series validation might have index 0 (column 0) and return a scalar (the whole series is okay) + * A per-cell validation might have index 0 (column 0) and return a series (True, True, False) indicating that cell 0 and 1 of column 0 are okay + * A per-frame validation would have index None, and might return True if the whole frame meets the validation, or a series indicating which columns or rows match the validation + +# Rework of combinedvalidations +## Bitwise +* Could assign each validation a bit in a large bitwise enum, and `or` together a number each time that index fails a validation.
This lets us track the origin of each warning, allowing us to slice them out by bit and generate an appropriate list of warnings \ No newline at end of file diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 28561dc..4435d1d 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -166,6 +166,14 @@ def validate(self, df: pd.DataFrame, flatten=True) -> typing.Union[ else: return warnings + def __or__(self, other: 'WarningSeriesGenerator'): + if not isinstance(other, WarningSeriesGenerator): + raise PanSchArgumentError('The "|" operator can only be used between two ' + 'Validations that subclass {}'.format(WarningSeriesGenerator.__name__)) + + return CombinedValidation(self, other, operator='or') + + class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): """ diff --git a/setup.py b/setup.py index 2441567..ff6d9a4 100755 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def run(self): packages=find_packages(include=['pandas_schema']), install_requires=[ 'numpy', - 'pandas>=0.19', + 'pandas>=0.23', 'dataclasses' ], cmdclass={ diff --git a/test/test_validation.py b/test/test_validation.py index 90e6b0c..2351434 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,14 +10,16 @@ import pandas as pd from pandas_schema.validations import * -from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, BaseValidation +from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, \ + BaseValidation from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning -def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ValidationWarning]: +def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ + ValidationWarning]: """ Tests a validator by asserting that it generates the amount of warnings :param series_dtype: Explicitly specifies the
dtype for the generated Series @@ -62,10 +64,12 @@ def setUp(self): ) def test_valid_inputs(self): - assert len(get_warnings(self.validator, ['good', 'success'])) == 0, 'did not accept valid inputs' + assert len(get_warnings(self.validator, ['good', + 'success'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - assert len(get_warnings(self.validator, ['fail', 'failure'])) == 2, 'accepted invalid inputs' + assert len(get_warnings(self.validator, + ['fail', 'failure'])) == 2, 'accepted invalid inputs' class CustomElement(ValidationTestBase): @@ -82,10 +86,12 @@ def setUp(self): def test_valid_inputs(self): assert len( - get_warnings(self.validator, ['_start_sdiyhsd', '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' + get_warnings(self.validator, ['_start_sdiyhsd', + '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - assert len(get_warnings(self.validator, ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' + assert len(get_warnings(self.validator, + ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' class LeadingWhitespace(ValidationTestBase): @@ -185,7 +191,8 @@ class CanCallLambda(ValidationTestBase): def setUp(self): # Succeed if it's divisible by 2, otherwise cause an error - self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, index=0) + self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, + index=0) def test_validate_noerror(self): assert len(get_warnings(self.validator, [ @@ -251,7 +258,8 @@ def test_invalid_elements(self): class InListCaseInsensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, + index=0) def test_valid_elements(self): assert len(get_warnings(self.validator, [ @@ -342,7 +350,8 @@ class CompiledRegexMatch(ValidationTestBase): """ def setUp(self): - self.validator 
= MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE), index=0) + self.validator = MatchesPatternValidation( + re.compile('^.+\.txt$', re.IGNORECASE), index=0) def test_valid_strings(self): assert len(get_warnings(self.validator, [ @@ -480,12 +489,11 @@ class Or(ValidationTestBase): """ def setUp(self): - self.validator = CombinedValidation( - MatchesPatternValidation('yes', index=0), - MatchesPatternValidation('pass', index=0), - 'or' + self.validator = MatchesPatternValidation( + 'yes', index=0 + ) | MatchesPatternValidation( + 'pass', index=0 ) - # self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass', index=0) def test_valid_items(self): assert len(get_warnings(self.validator, [ @@ -520,7 +528,8 @@ def test_default_message(self): 3 ] ), flatten=True): - self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') + self.assertNotRegex(error.message, self.message, + 'Validator not using the default warning message!') def test_custom_message(self): validator = InRangeValidation(min=4, message=self.message, index=0) @@ -531,7 +540,8 @@ def test_custom_message(self): 3 ] ), flatten=True): - self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') + self.assertRegex(error.message, self.message, + 'Validator not using the custom warning message!') @unittest.skip('allow_empty no longer exists') @@ -565,20 +575,25 @@ class PandasDtypeTests(ValidationTestBase): """ def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, + index=0) def test_valid_elements(self): - errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) - assert len(list(errors)) == 0 + errors = self.validator.validate_series( + pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) + assert len(list(errors)) == 0 def 
test_invalid_empty_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) + errors = self.validator.validate_series( + pd.Series(['aa', 'bb', 'd', None], dtype='category')) assert len(list(errors)) == 4 def test_invalid_and_empty_elements(self): - errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) + errors = self.validator.validate_series( + pd.Series(['a', None], dtype='category')) assert len(list(errors)) == 1 def test_invalid_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) + errors = self.validator.validate_series( + pd.Series(['aa', 'bb', 'd'], dtype='category')) assert len(list(errors)) == 3