diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.travis.yml b/.travis.yml old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/TODO.md b/TODO.md new file mode 100755 index 0000000..eca42d5 --- /dev/null +++ b/TODO.md @@ -0,0 +1,14 @@ +* [ ] Add validations that apply to every column in the DF equally (for the moment, users can just duplicate their validations) +* [x] Add validations that use the entire DF like, uniqueness +* [x] Fix CombinedValidations +* [x] Add replacement for allow_empty Columns +* [ ] New column() tests +* [x] New CombinedValidation tests +* [x] Implement the negate flag in the indexer +* [x] Add facility for allow_empty +* [x] Fix messages +* [x] Re-implement the or/and using operators +* [ ] Allow and/or operators between Series-level and row-level validations +* [ ] Separate ValidationClasses for each scope +* [ ] Add row-level validations +* [x] Fix message for DateAndOr test \ No newline at end of file diff --git a/UPDATE.md b/UPDATE.md new file mode 100755 index 0000000..c6c8a1d --- /dev/null +++ b/UPDATE.md @@ -0,0 +1,47 @@ +# ValidationWarnings +## Options for the ValidationWarning data +* We keep it as is, with one single ValidationWarning class that stores a `message` and a reference to the validation +that spawned it +* PREFERRED: As above, but we add a dictionary of miscellaneous kwargs to the ValidationWarning for storing stuff like the row index that failed +* We have a dataclass for each Validation type that stores things in a more structured way + * Why bother doing this if the Validation stores its own structure for the column index etc? 
+ +## Options for the ValidationWarning message +* It's generated from the Validation as a fixed string, as it is now +* It's generated dynamically by the VW + * This means that custom messages require overriding the VW class +* PREFERRED: It's generated dynamically in the VW by calling the parent Validation with a reference to itself, e.g. + ```python + class ValidationWarning: + def __str__(self): + return self.validation.generate_message(self) + + class Validation: + def generate_message(self, warning: ValidationWarning) -> str: + pass + ``` + * This lets the message function use all the validation properties, and the dictionary of kwargs that it specified + * `generate_message()` will call `default_message(**kwargs)`, the dynamic class method, or `self.custom_message`, the + non-dynamic string specified by the user + * Each category of Validation will define a `create_prefix()` method, that creates the {row: 1, column: 2} prefix + that goes before each message. Thus, `generate_message()` will concatenate that with the actual message +* + +## Options for placing CombinedValidation in the inheritance hierarchy +* In order to make CombinedValidation and BooleanSeriesValidation share a class, so they can be chained together, +either we had to make a mixin that creates a "side path" that doesn't call `validate` (in this case, `validate_with_series`), +or we + +# Rework of Validation Indexing +## All Indexed +* All Validations now have an index and an axis +* However, this index can be none, can be column only, row only, or both +* When combined with each other, the resulting boolean series will be broadcast using numpy broadcasting rules +* e.g. 
+ * A per-series validation might have index 0 (column 0) and return a scalar (the whole series is okay) + * A per-cell validation might have index 0 (column 0) and return a series (True, True, False) indicating that cells 0 and 1 of column 0 are okay + * A per-frame validation would have index None, and might return True if the whole frame meets the validation, or a series indicating which columns or rows match the validation + +# Rework of CombinedValidations +## Bitwise +* Could assign each validation a bit in a large bitwise enum, and `or` together a number each time that index fails a validation. This lets us track the origin of each warning, allowing us to slice them out by bit and generate an appropriate list of warnings \ No newline at end of file diff --git a/doc/common/introduction.rst b/doc/common/introduction.rst old mode 100644 new mode 100755 diff --git a/doc/readme/README.rst b/doc/readme/README.rst old mode 100644 new mode 100755 diff --git a/doc/readme/conf.py b/doc/readme/conf.py old mode 100644 new mode 100755 diff --git a/doc/site/Makefile b/doc/site/Makefile old mode 100644 new mode 100755 diff --git a/doc/site/conf.py b/doc/site/conf.py old mode 100644 new mode 100755 diff --git a/doc/site/index.rst b/doc/site/index.rst old mode 100644 new mode 100755 diff --git a/example/boolean.py b/example/boolean.py old mode 100644 new mode 100755 diff --git a/example/boolean.txt b/example/boolean.txt old mode 100644 new mode 100755 diff --git a/example/example.py b/example/example.py old mode 100644 new mode 100755 diff --git a/example/example.txt b/example/example.txt old mode 100644 new mode 100755 diff --git a/pandas_schema/__init__.py b/pandas_schema/__init__.py old mode 100644 new mode 100755 index 6f7ff97..fabe184 --- a/pandas_schema/__init__.py +++ b/pandas_schema/__init__.py @@ -1,4 +1,2 @@ -from .column import Column from .validation_warning import ValidationWarning -from .schema import Schema from .version import __version__ diff --git 
a/pandas_schema/column.py b/pandas_schema/column.py old mode 100644 new mode 100755 index 199b883..828d6a0 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,27 +1,117 @@ -import typing -import pandas as pd - -from . import validation -from .validation_warning import ValidationWarning - -class Column: - def __init__(self, name: str, validations: typing.Iterable['validation._BaseValidation'] = [], allow_empty=False): - """ - Creates a new Column object - - :param name: The column header that defines this column. This must be identical to the header used in the CSV/Data Frame you are validating. - :param validations: An iterable of objects implementing _BaseValidation that will generate ValidationErrors - :param allow_empty: True if an empty column is considered valid. False if we leave that logic up to the Validation - """ - self.name = name - self.validations = list(validations) - self.allow_empty = allow_empty - - def validate(self, series: pd.Series) -> typing.List[ValidationWarning]: - """ - Creates a list of validation errors using the Validation objects contained in the Column - - :param series: A pandas Series to validate - :return: An iterable of ValidationError instances generated by the validation - """ - return [error for validation in self.validations for error in validation.get_errors(series, self)] +from typing import Union, Iterable + +from pandas_schema.core import IndexValidation, BaseValidation +from pandas_schema.index import AxisIndexer, IndexValue + + +def column( + validations: Union[Iterable['IndexValidation'], 'IndexValidation'], + index = None, + override: bool = False, + recurse: bool = True, + allow_empty: bool = False +) -> Union[Iterable['IndexValidation'], 'IndexValidation']: + """A utility method for setting the index data on a set of Validations + + Args: + validations: A list of validations to modify + index: The index of the series that these validations will now consider + override: If true, override existing index 
values. Otherwise keep the existing ones + recurse: If true, recurse into child validations + allow_empty: Allow empty rows (NaN) to pass the validation + See :py:class:`pandas_schema.validation.IndexSeriesValidation` (Default value = False) + Returns: + """ + # TODO: Abolish this, and instead propagate the individual validator indexes when we And/Or them together + def update_validation(validation: BaseValidation): + if isinstance(validation, IndexValidation): + if override or validation.index is None: + validation.index = index + + if allow_empty: + return validation.optional() + else: + return validation + + if isinstance(validations, Iterable): + ret = [] + for valid in validations: + if recurse: + ret.append(valid.map(update_validation)) + else: + ret.append(update_validation(valid)) + return ret + else: + if recurse: + return validations.map(update_validation) + else: + return update_validation(validations) + + return validations + + +def column_sequence( + validations: Iterable['IndexValidation'], + override: bool = False +) -> Iterable['IndexValidation']: + """A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so + that the first validation gets index 0, the second gets index 1 etc. Note: this will not modify any index that + already has some kind of index unless you set override=True + + Args: + validations: A list of validations to modify + override: If true, override existing index values. 
Otherwise keep the existing ones + validations: typing.Iterable['pandas_schema.core.IndexValidation']: + override: bool: (Default value = False) + + Returns: + + """ + for i, valid in validations: + if override or valid.index is None: + valid.index = AxisIndexer(i, typ='positional') + return validations + + +def each_column(validations: Iterable[IndexValidation], columns: IndexValue): + """Duplicates a validation and applies it to each column specified + + Args: + validations: A list of validations to apply to each column + columns: An index that should, when applied to the column index, should return all columns you want this to + validations: typing.Iterable[pandas_schema.core.IndexValidation]: + columns: IndexValue: + + Returns: + + """ + +# +# def label_column( +# validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], +# index: typing.Union[int, str], +# ): +# """ +# A utility method for setting the label-based column for each validation +# :param validations: A list of validations to modify +# :param index: The label of the series that these validations will now consider +# """ +# return _column( +# validations, +# index, +# position=False +# ) +# +# def positional_column( +# validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], +# index: int, +# ): +# """ +# A utility method for setting the position-based column for each validation +# :param validations: A list of validations to modify +# :param index: The index of the series that these validations will now consider +# """ +# return _column( +# validations, +# index, +# position=True diff --git a/pandas_schema/core.py b/pandas_schema/core.py new file mode 100755 index 0000000..dfd5af1 --- /dev/null +++ b/pandas_schema/core.py @@ -0,0 +1,576 @@ +import abc +import math +import datetime +from itertools import chain +import pandas as pd +import numpy as np +import typing +import operator +from dataclasses import dataclass +import enum +import copy +from math import isnan + 
+from .errors import PanSchArgumentError, PanSchNoIndexError +from pandas_schema.validation_warning import ValidationWarning, CombinedValidationWarning +from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, \ + DualAxisIndexer, BooleanIndexer +from pandas_schema.scope import ValidationScope +from pandas.api.types import is_categorical_dtype, is_numeric_dtype + +SubSelection = typing.Union[pd.Series, pd.DataFrame, object] +""" +Anything that an indexer could return from a DataFrame +""" + + +class BaseValidation(abc.ABC): + """ + A validation is, broadly, just a function that maps a data frame to a list of errors + """ + + def __init_subclass__(cls, scope: ValidationScope = ValidationScope.CELL, **kwargs): + # We override this so that you can set the scope at the time you declare the validation class, not the instance + cls.scope = scope + + def __init__(self, message: str = None): + """ + Creates a new IndexSeriesValidation + :param message: A custom message to use for ValidationWarnings generated by this validation + """ + self.custom_message = message + + def recurse(self, func: typing.Callable[['BaseValidation'], typing.Any]) -> list: + """ + Calls a function on this validation and all of its children (if this is a compound validation) + Args: + func: A function whose only argument is a single validation. The function might change the validation, or + if can return a value, in which case the value will be included in the final result + + Returns: + A list of result values + + """ + return [func(self)] + + def map(self, func: typing.Callable[['BaseValidation'], 'BaseValidation']) -> 'BaseValidation': + """ + Calls a function on this validation and all of its children (if this is a compound validation) + This function return a validation that will replace the validation it receives as an argument. + Args: + func: A function whose only argument is a single validation. 
The function might change the validation, or + if can return a value, in which case the value will be included in the final result + + Returns: + A list of result values + + """ + return func(self) + + def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: + """ + Creates a DF-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self) + + def make_row_warning(self, df: pd.DataFrame, row_index: IndexValue) -> ValidationWarning: + """ + Creates a series-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self, row=row_index) + + def make_series_warning(self, df: pd.DataFrame, column: str, + series: pd.Series) -> ValidationWarning: + """ + Creates a series-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self, column=column) + + def make_cell_warning(self, df: pd.DataFrame, column: str, row: int, value, + series: pd.Series = None) -> ValidationWarning: + """ + Creates a cell-scope warning. 
Can be overridden by child classes + """ + return ValidationWarning(self, column=column, row=row, value=value) + + def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): + """ + Takes an index that points to parts of the DF that have *failed* validation, and returns a Series (or similar) + that has ValidationWarning instances at each index that has failed + :param df: The DataFrame we're validating + :param index: The index pointing to the failed parts of the DF + :param failed: The result of applying index to the DF + """ + + # If it's am empty series/frame then this produced no warnings + if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: + return pd.Series() + + # Depending on the scope, we produce the lists of warnings in different ways (ideally the most efficient ways) + if isinstance(failed, pd.DataFrame): + if self.scope == ValidationScope.DATA_FRAME: + return [self.make_df_warning(df)] + elif self.scope == ValidationScope.SERIES: + return failed.apply(lambda series: self.make_series_warning( + df=df, + column=series.name, + series=series + ), axis='rows') + elif self.scope == ValidationScope.ROW: + return failed.apply(lambda row: self.make_row_warning( + df=df, + row_index=row.name + ), axis='columns') + elif self.scope == ValidationScope.CELL: + return failed.apply(lambda series: series.to_frame().apply( + lambda cell: self.make_cell_warning( + df=df, + column=series.name, + series=series, + row=cell.name, + value=cell + ))) + elif isinstance(failed, pd.Series): + if self.scope == ValidationScope.SERIES: + return [self.make_series_warning( + df=df, + column=failed.name, + series=failed + )] + elif self.scope == ValidationScope.CELL: + # DataFrame.apply returns a series if the function returns a scalar, as it does here + return failed.to_frame().apply(lambda cell: self.make_cell_warning( + df=df, + column=index.col_index.index, + series=failed, + row=cell.name, + value=cell[0] + ), axis='columns') 
+ else: + return [self.make_cell_warning( + df=df, + column=index.col_index.index, + row=index.row_index.index, + value=failed) + ] + + def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: + """ + Converts an index into a series of warnings each corresponding to an issue + with the DataFrame at the same index. + """ + index = self.get_failed_index(df) + failed = index(df) + + return self.index_to_warnings_series(df, index, failed) + + @staticmethod + def to_warning_list(failed: SubSelection): + """ + Converts a Series/DF of warnings to a list of warnings + """ + if isinstance(failed, pd.DataFrame): + return failed.to_numpy().tolist() + elif isinstance(failed, pd.Series): + return failed.tolist() + else: + return failed + + def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: + """ + Validates a data frame and returns a list of issues with it + :param df: Data frame to validate + :return: All validation failures detected by this validation + """ + failed = self.get_warnings_series(df) + return self.to_warning_list(failed) + + @abc.abstractmethod + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + """ + Returns an indexer object that fully specifies which sections of the DataFrame this validation believes are + invalid (both row and column-wise) + """ + + @abc.abstractmethod + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + """ + Returns an indexer object that fully specifies which sections of the DataFrame this validation believes are + valid (both row and column-wise) + """ + + def message(self, warning: ValidationWarning) -> str: + """ + Get a string that fully describes the provided warning, given that the warning was generating by this validation + """ + return "{} {}".format(self.prefix(warning), self.suffix(warning)) + + def prefix(self, warning: ValidationWarning): + """ + Return a string that can be used to prefix a message that relates to this index + + This method is safe to override + 
""" + return "" + + def suffix(self, warning: ValidationWarning): + # The suffix can be overridden in two ways, either using a custom message (the most common), or with a custom + # default_message() function + if self.custom_message: + return self.custom_message + else: + return self.default_message(warning) + + @property + def readable_name(self): + """ + A readable name for this validation, to be shown in validation warnings + """ + return type(self).__name__ + + def default_message(self, warning: ValidationWarning) -> str: + """ + Returns a description of this validation, to be included in the py:meth:~message as the suffix`` + """ + return 'failed the {}'.format(self.readable_name) + + def __or__(self, other: 'BaseValidation'): + """ + Returns a validation that will only return an error if both validations fail at the same place + :param other: Another validation to combine with this + """ + if not isinstance(other, BaseValidation): + raise PanSchArgumentError('The "|" operator can only be used between two' + 'Validations that subclass {}'.format( + self.__class__)) + + # TODO: Propagate the individual validator indexes when we And/Or them together + return CombinedValidation(self, other, operator=operator.or_) + + def __and__(self, other: 'BaseValidation'): + """ + Returns a validation that will only return an error if both validations fail at the same place + :param other: Another validation to combine with this + """ + if not isinstance(other, BaseValidation): + raise PanSchArgumentError('The "&" operator can only be used between two' + 'Validations that subclass {}'.format( + self.__class__)) + + # TODO: Propagate the individual validator indexes when we And/Or them together + return CombinedValidation(self, other, operator=operator.and_) + + +class IndexValidation(BaseValidation): + """ + An IndexValidation expands upon a BaseValidation by adding an index (in Pandas co-ordinates) that points to the + Series/DF sub-selection/row/cell that it validates + 
""" + + def __init__( + self, + index: DualAxisIndexer, + *args, + **kwargs + ): + """ + Creates a new IndexSeriesValidation + :param index: An index with which to select the series + Otherwise it's a label (ie, index=0) indicates the column with the label of 0 + """ + super().__init__(*args, **kwargs) + self.index = index + + def apply_index(self, df: pd.DataFrame) -> SubSelection: + """ + Select a series using the data stored in this validation + """ + return self.index(df) + + def prefix(self, warning: ValidationWarning): + ret = [] + + if self.index.col_index is not None: + col_str = self.index.col_index.for_message() + if col_str: + ret.append(col_str) + + ret.append('Row {}'.format(warning.props['row'])) + + ret.append('Value "{}"'.format(warning.props['value'])) + + return '{' + ', '.join(ret) + '}' + + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + selection = self.apply_index(df) + return self.validate_selection(selection) + + @abc.abstractmethod + def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: + """ + Given a selection, return an indexer that points to elements that passed the validation + """ + pass + + def optional(self) -> 'CombinedValidation': + """ + Makes this Validation optional, by returning a CombinedValidation that accepts empty cells + """ + return CombinedValidation( + self, + IsEmptyValidation(index=self.index), + operator=operator.or_ + ) + + +class SeriesValidation(IndexValidation): + """ + A type of IndexValidation that expands IndexValidation with the knowledge that it will validate a single Series + """ + _index: typing.Optional[DualAxisIndexer] + + def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer] = None, negated: bool=False, *args, **kwargs): + """ + Create a new SeriesValidation + :param index: The index pointing to the Series to validate. For example, this might be 2 to validate Series + with index 2, or "first_name" to validate a Series named "first_name". 
For more advanced indexing, you may + pass in an instance of the RowIndexer class + """ + # This convets the index from primitive numbers into a data structure + self._index = None + self.index = index + + super().__init__( + *args, + index=self.index, + **kwargs + ) + + self.negated = negated + """ + This broadly means that this validation will do the opposite of what it normally does. The actual implementation + depends on the subclass checking for this field whenever it needs to. Even for IndexValidations, we can't invert + the actual index, because it doesn't exist yet. It's only created after we run the actual validation + """ + + @property + def index(self): + return self._index + + @index.setter + def index(self, val): + # We have to convert a single-axis index into a dual-axis index + if val is not None: + if isinstance(val, DualAxisIndexer): + self._index = val + else: + self._index = DualAxisIndexer( + col_index=val, + row_index=BooleanIndexer(index=True, axis=0), + ) + + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + index = super().get_passed_index(df) + if self.negated: + return index.invert(axis=0) + else: + return index + + def get_failed_index(self, df) -> DualAxisIndexer: + # This is the opposite of get_passed_index, so we just flip the conditional + index = super().get_passed_index(df) + if self.negated: + return index + else: + return index.invert(axis=0) + + def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: + """ + Since this is a SeriesValidation, we can simplify the validation. Now we only have to ask the subclass to take + a Series and return a Series (or slice) that indicates the successful cells (or series). Then we can combine + this with the current index to produce an indexer that finds all failing cells in the DF + """ + row_index = self.validate_series(selection) + + # As a convenience, we allow validate_series to return a boolean. 
If True it indicates everything passed, so + # convert it to a None slice which returns everything, and if false convert it to an empty list, an indexer + # that returns nothing + # if isinstance(row_index, bool): + # if row_index: + # row_index = slice(None) + # else: + # row_index = [] + + return DualAxisIndexer( + row_index=BooleanIndexer(row_index, axis=0), + col_index=self.index.col_index + ) + + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> IndexValue: + """ + Given a series, return a bool Series that has values of True if the series + passes the validation, otherwise False + """ + + def __invert__(self) -> 'BaseValidation': + """ + Returns: A copy of this validation, but that validates the opposite of what it normally would + """ + clone = copy.copy(self) + clone.negated = True + return clone + + +class CombinedValidation(BaseValidation): + """ + Validates if one and/or the other validation is true for an element + """ + + def __init__( + self, + validation_a: BaseValidation, + validation_b: BaseValidation, + operator: typing.Callable[[pd.Series, pd.Series], pd.Series], + axis='rows' + ): + """ + Creates a new CombinedValidation + :param validation_a: The first validation to combine + :param validation_b: The second validation to combine + :param operator: An operator, likely operator.or_ or operator.and_ that we should use to combine Validations + :param axis: The axis across which to combine validations. If this is "rows", then we keep the column indices + of each result, and combine the row indices (the most common option). 
If this is "columns", do the opposite + """ + super().__init__() + self.operator = operator + self.left = validation_a + self.right = validation_b + self.axis = axis + + def recurse(self, func: typing.Callable[['BaseValidation'], typing.Any]) -> list: + return [*super().recurse(func), *self.left.recurse(func), *self.right.recurse(func)] + + def map(self, func): + new = func(self) + new.left = new.left.map(func) + new.right = new.right.map(func) + return new + + # def message(self, warning: ValidationWarning) -> str: + # # Nothing should ever try to create a ValidationWarning directly from a CombinedValidation, + # # it should always use the original warnings from the child Validations + # raise NotImplementedError() + + # def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): + # # We handle this method by deferring to the children + + def combine_indices(self, left: DualAxisIndexer, right: DualAxisIndexer) -> DualAxisIndexer: + """ + Utility method for combining two indexers using boolean logic + """ + # TODO: convert axis into an integer and apply proper pandas logic + if self.axis == 'rows': + assert left.col_index == right.col_index + assert isinstance(left.row_index, BooleanIndexer) + return DualAxisIndexer( + row_index=BooleanIndexer(self.operator( + left.row_index.index, + right.row_index.index + ), axis=0), + col_index=left.col_index + ) + + elif self.axis == 'columns': + assert left.row_index == right.row_index + assert isinstance(left.col_index, BooleanIndexer) + return DualAxisIndexer( + row_index=left.row_index, + col_index=BooleanIndexer(self.operator( + left.col_index.index, + right.col_index.index + ), axis=1) + ) + + else: + raise Exception() + + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + left_passed = self.left.get_passed_index(df) + right_passed = self.right.get_passed_index(df) + return self.combine_indices(left_passed, right_passed) + + def get_failed_index(self, df: 
pd.DataFrame) -> DualAxisIndexer: + return self.get_passed_index(df).invert(self.axis) + + def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): + # In a normal validation this method would create new Validatations, and use the index, but we don't actually + # need either here + return self.get_warnings_series(df) + + def combine(self, left: SubSelection, right: SubSelection): + """ + Combine two subsections of the DataFrame, each containing :py:class:`pandas_schema.validation_warning.ValidationWarning` + instances + """ + + # Convert the data into warnings, and then join together the warnings from both validations + def combine_index(left, right): + # Make a CombinedValidationWarning if it failed both validations, otherwise return the single failure + if left: + if right: + return CombinedValidationWarning(left, right, validation=self) + else: + return left + else: + return right + + if isinstance(left, (pd.Series, pd.DataFrame)): + return left.combine(right, combine_index, fill_value=False) + elif isinstance(right, (pd.Series, pd.DataFrame)): + return right.combine(left, combine_index, fill_value=False) + else: + return combine_index(left, right) + + def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: + # Let both validations separately select and filter a column + left_index = self.left.get_passed_index(df) + right_index = self.right.get_passed_index(df) + + # Combine them with boolean logic + # We have to invert the combined index because left and right are *passed* indices not failed ones + combined = self.combine_indices(left_index, right_index).invert(axis=0) + + # Slice out the failed data + # We have to invert these because left_index and right_index are passed indices + left_failed = left_index.invert(axis=0)(df) + right_failed = right_index.invert(axis=0)(df) + + warnings = self.combine( + self.left.index_to_warnings_series(df, left_index, left_failed), + self.right.index_to_warnings_series(df, 
right_index, right_failed) + ) + # warnings = self.left.index_to_warnings_series(df, left_index, left_failed).combine( + # self.right.index_to_warnings_series(df, right_index, right_failed), + # func=combine, + # fill_value=False + # ) + + # Finally, apply the combined index from above to the warnings series + if self.axis == 'rows': + return warnings[combined.row_index.index] + else: + return warnings[combined.col_index.index] + + +class IsEmptyValidation(SeriesValidation): + """ + Validates that each element in the Series is "empty". For most dtypes, this means each element contains null, + but for strings we consider 0-length strings to be empty + """ + + def validate_series(self, series: pd.Series) -> IndexValue: + if is_categorical_dtype(series) or is_numeric_dtype(series): + return series.isnull() + else: + return series.str.len() == 0 diff --git a/pandas_schema/df_validations.py b/pandas_schema/df_validations.py new file mode 100644 index 0000000..be372bc --- /dev/null +++ b/pandas_schema/df_validations.py @@ -0,0 +1,69 @@ +from abc import abstractmethod +from typing import Union + +import pandas as pd + +from . 
import ValidationWarning +from .core import BaseValidation, ValidationScope +from .index import DualAxisIndexer, BooleanIndexer + + +class DfRowValidation(BaseValidation): + """ + Validates the entire DF at once, by returning a boolean Series corresponding to row indices that pass or fail + """ + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + passed = self.get_passed_index(df) + return passed.invert(axis=0) + + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + series = self.validate_df(df) + return DualAxisIndexer( + row_index=BooleanIndexer(series, axis=0), + col_index=BooleanIndexer(True, axis=1) + ) + + @abstractmethod + def validate_df(self, df: pd.DataFrame) -> pd.Series: + """ + Validate the DF by returning a boolean series + Args: + df: The DF to validate + + Returns: A boolean Series whose indices correspond to the row indices of the DF. If the Series has the value + True, this means the corresponding row passed the validation + + Example: + If we were for some reason validating that each row contains values higher than any element in the previous + row:: + + 1 2 3 + 4 5 6 + 1 1 1 + + The correct boolean Series to return here would be:: + + True + True + False + """ + + +class DistinctRowValidation(DfRowValidation, scope=ValidationScope.ROW): + def __init__(self, keep: Union[bool, str] = False, **kwargs): + """ + Args: + keep: Refer to the pandas docs: + "first" indicates that duplicates fail the validation except for the first occurrence. + "last" indicates that duplicates fail the validation except for the last occurrence. 
+ False indicates that all duplicates fail the validation + """ + super().__init__(**kwargs) + self.keep = keep + + def prefix(self, warning: ValidationWarning): + return '{{Row {row}}}'.format(**warning.props) + + def validate_df(self, df: pd.DataFrame) -> pd.Series: + # We invert here because pandas gives duplicates a True value but we want them to be False as in "invalid" + return ~df.duplicated(keep=self.keep) diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py old mode 100644 new mode 100755 index a9176bf..cdc3132 --- a/pandas_schema/errors.py +++ b/pandas_schema/errors.py @@ -1,8 +1,20 @@ -class PanSchError(BaseException): +class PanSchError(Exception): """ Base class for all pandas_schema exceptions """ + def __init__(self, message=None): + super().__init__(message) + + +class PanSchIndexError(PanSchError): + """ + Some issue with creating a PandasIndexer + """ + + def __init__(self, message): + super().__init__(message=message) + class PanSchInvalidSchemaError(PanSchError): """ @@ -10,6 +22,12 @@ class PanSchInvalidSchemaError(PanSchError): """ +class PanSchNoIndexError(PanSchInvalidSchemaError): + """ + A validation was provided that has not specified an index + """ + + class PanSchArgumentError(PanSchError): """ An argument passed to a function has an invalid type or value diff --git a/pandas_schema/index.py b/pandas_schema/index.py new file mode 100755 index 0000000..8015ae1 --- /dev/null +++ b/pandas_schema/index.py @@ -0,0 +1,348 @@ +from pandas_schema.errors import PanSchIndexError +from dataclasses import dataclass +from typing import Union, Optional, Any +import numpy as np +import pandas as pd +from enum import Enum +from abc import ABC, abstractmethod, ABCMeta + +# IndexError: only integers, slices (`:`), ellipsis (`...`), np.newaxis (`None`) and integer or boolean arrays are valid indices +IndexValue = Union[np.ndarray, pd.Series, str, int, slice] +""" +A pd index can either be an integer or string, or an array of either. 
This typing is a bit sketchy because really +a lot of things are accepted here +""" + + +class IndexType(Enum): + POSITION = 0 + LABEL = 1 + + +@dataclass +class AxisIndexer(ABC): + """ + Abstract base class for indexers + Each index represents a particular slice on a particular axis of a DataFrame. In this way, each AxisIndexer + attempts to recreate the behaviour of `df.ix[some_index]` + """ + + index: Any + """ + The index to use, either an integer for position-based indexing, or a string for label-based indexing + """ + + axis: int + """ + The axis for the indexer + """ + + @abstractmethod + def __call__(self, df: pd.DataFrame): + """ + Apply this index + :param df: The DataFrame to index + :param axis: The axis to index along. axis=0 will select a row, and axis=1 will select a column + """ + + @abstractmethod + def for_loc(self, df: pd.DataFrame): + """ + Returns this index as something that could be passed into df.loc[] + """ + + @abstractmethod + def for_iloc(self, df): + """ + Returns this index as something that could be passed into df.iloc[] + """ + + @abstractmethod + def for_message(self) -> Optional[str]: + """ + Returns a string that could be used to describe this indexer in a human readable way. However, returns None + if this indexer should not be described + """ + + @abstractmethod + def __invert__(self) -> 'AxisIndexer': + """ + Returns an index that is inverted (will return the opposite of what was previously specified) + """ + + +@dataclass(init=False) +class DirectIndexer(AxisIndexer): + """ + A simple indexer that passes its index value directly into loc or iloc. 
For this reason it support any kind of + indexing, using boolean series, label series, position series, slices etc + """ + + type: IndexType + """ + The type of indexing to use, either 'position' or 'label' + """ + + negate: bool + """ + If yes, return all values that this index does *not* select + """ + # index: IndexValue + """ + The index to use, either an integer for position-based indexing, or a string for label-based indexing + """ + + def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1, negate=False): + self.index = index + self.axis = axis + self.negate = negate + + if typ is not None: + if not isinstance(typ, IndexType): + raise PanSchIndexError('Index must be a subclass of IndexType') + self.type = typ + else: + # If the type isn't provided, guess it based on the datatype of the index + if isinstance(index, slice): + # Slices can be used in either indexer + self.type = IndexType.POSITION + elif isinstance(index, list): + self.type = IndexType.POSITION + elif isinstance(index, pd.Series) and np.issubdtype(index.dtype, np.bool_): + # Boolean series can actually be used in loc or iloc, but let's assume it's only iloc for simplicity + self.type = IndexType.POSITION + elif np.issubdtype(type(index), np.character): + self.type = IndexType.LABEL + elif np.issubdtype(type(index), np.int_): + self.type = IndexType.POSITION + else: + raise PanSchIndexError('The index value was not either an integer or string, or an array of either of ' + 'these') + + def __call__(self, df: pd.DataFrame): + """ + Apply this index + :param df: The DataFrame to index + :param axis: The axis to index along. 
axis=0 will select a row, and axis=1 will select a column + """ + if self.type == IndexType.LABEL: + return df.loc(axis=self.axis)[self.index] + elif self.type == IndexType.POSITION: + return df.iloc(axis=self.axis)[self.index] + + def for_loc(self, df: pd.DataFrame): + """ + Returns this index as something that could be passed into df.loc[] + """ + if self.type == IndexType.LABEL: + return self.index + elif self.type == IndexType.POSITION: + return df.axes[self.axis][self.index] + + def for_iloc(self, df): + """ + Returns this index as something that could be passed into df.iloc[] + """ + if self.type == IndexType.LABEL: + return df.axes[self.axis].get_indexer(self.index) + elif self.type == IndexType.POSITION: + return self.index + + def for_message(self) -> Optional[str]: + """ + Returns a string that could be used to describe this indexer in a human readable way. However, returns None + if this indexer should not be described + """ + if self.axis == 0: + prefix = "Row" + else: + prefix = "Column" + + if isinstance(self.index, int): + idx = str(self.index) + elif isinstance(self.index, str): + idx = '"{}"'.format(self.index) + elif isinstance(self.index, slice): + if self.index == slice(None): + # If it's a slice of everything, skip this index + return None + else: + idx = str(self.index) + else: + idx = str(self.index) + + return "{} {}".format(prefix, idx) + + @staticmethod + def invert_index(index: IndexValue): + if isinstance(index, slice) and index.start is None and index.stop is None: + # If this is a None slice, it would previously return everything, so make it return nothing + return [] + elif isinstance(index, list) and len(index) == 0: + # If this is an empty list, it would previously return nothing, so make it return everything + return slice(None) + elif isinstance(index, pd.Series) and np.issubdtype(index.dtype, np.bool_): + # Boolean series have a built-in inversion + return ~index + # elif np.issubdtype(type(index), np.int_): + # # Index series 
can't be inverted without knowing the original DF + else: + raise PanSchIndexError('Uninvertible type') + + def __invert__(self) -> 'AxisIndexer': + """ + Returns an index that is inverted (will return the opposite of what was previously specified) + """ + return DirectIndexer( + # index=self.invert_index(self.index), + index=self.index, + typ=self.type, + axis=self.axis, + negate=not self.negate + ) + + +BooleanIndexType = Union[pd.Series, bool] + + +class BooleanIndexer(AxisIndexer): + def __init__(self, index: BooleanIndexType, axis: int = 1): + self.index = index + self.axis = axis + + def __invert__(self) -> 'AxisIndexer': + return BooleanIndexer( + index=np.logical_not(self.index), + axis=self.axis + ) + + @property + def direct_index(self): + """ + Converts this indexer's self.index into a value that can be passed directly into iloc[] + """ + # If it's a scalar boolean, we need special values + if np.issubdtype(type(self.index), np.bool_) and np.ndim(self.index) == 0: + if self.index: + return slice(None) + else: + return [] + + # If it's a vector, pandas can deal with it + return self.index + + def __call__(self, df: pd.DataFrame): + """ + Apply this index + :param df: The DataFrame to index + :param axis: The axis to index along. axis=0 will select a row, and axis=1 will select a column + """ + return df.iloc(axis=self.axis)[self.direct_index] + + def for_loc(self, df: pd.DataFrame): + """ + Returns this index as something that could be passed into df.loc[] + """ + return self.direct_index + + def for_iloc(self, df): + """ + Returns this index as something that could be passed into df.iloc[] + """ + return self.direct_index + + def for_message(self) -> Optional[str]: + """ + Returns a string that could be used to describe this indexer in a human readable way. 
However, returns None + if this indexer should not be described + """ + if self.axis == 0: + prefix = "Row" + else: + prefix = "Column" + + return "{} {}".format(prefix, str(self.index)) + + +class SubIndexerMeta(ABCMeta): + """ + Metaclass for RowIndexer and ColumnIndexer, allowing then to do instance checks in a more flexible way + """ + + def __init__(cls, *args, axis: int, **kwargs): + super().__init__(*args) + cls.axis = axis + + def __new__(metacls, name, bases, namespace, **kargs): + return super().__new__(metacls, name, bases, namespace) + + @classmethod + def __prepare__(metacls, name, bases, **kwargs): + return super().__prepare__(name, bases, **kwargs) + + def __instancecheck__(self, instance): + # Any AxisIndexer can be considered a ColumnIndexer if it has axis 0 + result = super().__instancecheck__(instance) + if not result and isinstance(instance, AxisIndexer) and instance.axis == self.axis: + return True + else: + return result + + +class RowIndexer(AxisIndexer, axis=0, metaclass=SubIndexerMeta): + pass + + +class ColumnIndexer(AxisIndexer, axis=1, metaclass=SubIndexerMeta): + pass + + +class DirectRowIndexer(DirectIndexer): + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=0) + + +class DirectColumnIndexer(DirectIndexer): + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=1) + + +@dataclass(init=False) +class DualAxisIndexer: + """ + Completely specifies some subset of a DataFrame, using both axes + """ + row_index: RowIndexer + col_index: ColumnIndexer + + def __init__(self, row_index: Union[RowIndexer, IndexValue], col_index: Union[ColumnIndexer, IndexValue]): + # Use the validation and automatic conversion built into the AxisIndexer class to handle these inputs + if isinstance(row_index, RowIndexer): + self.row_index = row_index + else: + self.row_index = DirectIndexer(index=row_index, axis=0) + + if isinstance(col_index, 
ColumnIndexer): + self.col_index = col_index + else: + self.col_index = DirectIndexer(index=col_index, axis=1) + + def __call__(self, df: pd.DataFrame): + return df.loc[self.row_index.for_loc(df), self.col_index.for_loc(df)] + + def invert(self, axis) -> 'AxisIndexer': + """ + Returns an index that is inverted along the given axis. e.g. if axis=0, the column index stays the same, but + all row indices are inverted. + """ + if axis == 0: + return DualAxisIndexer( + row_index=~self.row_index, + col_index=self.col_index + ) + + elif axis == 1: + return DualAxisIndexer( + row_index=self.row_index, + col_index=~self.col_index + ) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py old mode 100644 new mode 100755 index 5c0442e..b777881 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -1,9 +1,10 @@ import pandas as pd import typing -from .errors import PanSchInvalidSchemaError, PanSchArgumentError -from .validation_warning import ValidationWarning -from .column import Column +from pandas_schema.core import BaseValidation +from pandas_schema.errors import PanSchArgumentError, PanSchInvalidSchemaError +from pandas_schema.validation_warning import ValidationWarning +from pandas_schema.index import AxisIndexer class Schema: @@ -11,83 +12,32 @@ class Schema: A schema that defines the columns required in the target DataFrame """ - def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): + def __init__(self, validations: typing.Iterable[BaseValidation]): """ - :param columns: A list of column objects - :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring - the header names. False if the columns should be associated by column header names only. 
Defaults to False + :param validations: A list of validations that will be applied to the DataFrame upon validation """ - if not columns: - raise PanSchInvalidSchemaError('An instance of the schema class must have a columns list') + if not validations: + raise PanSchInvalidSchemaError('An instance of the schema class must have a validations list') - if not isinstance(columns, typing.List): - raise PanSchInvalidSchemaError('The columns field must be a list of Column objects') + if not isinstance(validations, typing.Iterable): + raise PanSchInvalidSchemaError('The columns field must be an iterable of Validation objects') - if not isinstance(ordered, bool): - raise PanSchInvalidSchemaError('The ordered field must be a boolean') + self.validations = list(validations) - self.columns = list(columns) - self.ordered = ordered - - def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing.List[ValidationWarning]: + def validate(self, df: pd.DataFrame, subset: AxisIndexer = None) -> typing.List[ValidationWarning]: """ Runs a full validation of the target DataFrame using the internal columns list :param df: A pandas DataFrame to validate - :param columns: A list of columns indicating a subset of the schema that we want to validate + :param subset: A list of columns indicating a subset of the schema that we want to validate. Can be any :return: A list of ValidationWarning objects that list the ways in which the DataFrame was invalid """ - errors = [] - df_cols = len(df.columns) - - # If no columns are passed, validate against every column in the schema. This is the default behaviour - if columns is None: - schema_cols = len(self.columns) - columns_to_pair = self.columns - if df_cols != schema_cols: - errors.append( - ValidationWarning( - 'Invalid number of columns. 
The schema specifies {}, but the data frame has {}'.format( - schema_cols, - df_cols) - ) - ) - return errors - - # If we did pass in columns, check that they are part of the current schema - else: - if set(columns).issubset(self.get_column_names()): - columns_to_pair = [column for column in self.columns if column.name in columns] - else: - raise PanSchArgumentError( - 'Columns {} passed in are not part of the schema'.format(set(columns).difference(self.columns)) - ) - - # We associate the column objects in the schema with data frame series either by name or by position, depending - # on the value of self.ordered - if self.ordered: - series = [x[1] for x in df.iteritems()] - column_pairs = zip(series, self.columns) - else: - column_pairs = [] - for column in columns_to_pair: - - # Throw an error if the schema column isn't in the data frame - if column.name not in df: - errors.append(ValidationWarning( - 'The column {} exists in the schema but not in the data frame'.format(column.name))) - return errors + # Apply the subset if we have one + if subset is not None: + df = subset(df) - column_pairs.append((df[column.name], column)) - - # Iterate over each pair of schema columns and data frame series and run validations - for series, column in column_pairs: - errors += column.validate(series) - - return sorted(errors, key=lambda e: e.row) - - def get_column_names(self): - """ - Returns the column names contained in the schema - """ - return [column.name for column in self.columns] + # Build the list of errors + errors = [] + for validation in self.validations: + errors.extend(validation.validate(df)) + return errors diff --git a/pandas_schema/scope.py b/pandas_schema/scope.py new file mode 100644 index 0000000..ccc66b6 --- /dev/null +++ b/pandas_schema/scope.py @@ -0,0 +1,13 @@ +import enum + +class ValidationScope(enum.Enum): + """ + Defines the scope of a validation, ie DATA_FRAME scope means this validation validates the entire DataFrame is + valid or invalid, SERIES 
means each series can be valid/invalid, and CELL means each index anywhere in the frame + can be valid/invalid + """ + DATA_FRAME = 0 + SERIES = 1 + CELL = 2 + ROW = 3 + diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py old mode 100644 new mode 100755 index 320be65..55fad7e --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -1,22 +1,72 @@ +from dataclasses import dataclass, field + +@dataclass class ValidationWarning: """ - Represents a difference between the schema and data frame, found during the validation of the data frame + Represents a difference between the schema and data frame, found during the validation + of the data frame + """ + validation: 'pandas_schema.core.BaseValidation' + """ + The validation that spawned this warning + """ + + props: dict = field(default_factory=dict) + """ + List of data about this warning in addition to that provided by the validation, for + example, if a cell in the DataFrame didn't match the validation, the props might + include a `value` key, for storing what the actual value was """ - def __init__(self, message: str, value: str = None, row: int = -1, column: str = None): - self.message = message - self.value = value - """The value of the failing cell in the DataFrame""" - self.row = row - """The row index (usually an integer starting from 0) of the cell that failed the validation""" - self.column = column - """The column name of the cell that failed the validation""" + def __init__(self, validation, **props): + self.validation = validation + self.props = props - def __str__(self) -> str: + @property + def message(self) -> str: """ - The entire warning message as a string + Return this validation as a string """ - if self.row is not None and self.column is not None and self.value is not None: - return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row, self.column, self.value, self.message) - else: - return self.message + # Internally, this actually 
asks the validator class to formulate a message + return self.validation.message(self) + + @property + def prefix(self) -> str: + return self.validation.prefix(self) + + @property + def suffix(self) -> str: + return self.validation.suffix(self) + + def __str__(self): + return self.message + + +class CombinedValidationWarning(ValidationWarning): + """ + Warning for a CombinedValidation, which itself wraps 2 other Warnings from child Validations + """ + left: ValidationWarning + right: ValidationWarning + + def __init__(self, left: ValidationWarning, right: ValidationWarning, **kwargs): + super().__init__(**kwargs) + self.left = left + self.right = right + + @property + def message(self): + """ + Return this validation as a string + """ + # Unlike a normal ValidationWarning, this doesn't ask CombinedValidation for a message, it just combines + # existing messages + return '{} {} and {}'.format(self.left.prefix, self.left.suffix, self.right.suffix) + + @property + def suffix(self) -> str: + return '{} and {}'.format(self.left.suffix, self.right.suffix) + + @property + def prefix(self) -> str: + return self.left.prefix diff --git a/pandas_schema/validation.py b/pandas_schema/validations.py old mode 100644 new mode 100755 similarity index 53% rename from pandas_schema/validation.py rename to pandas_schema/validations.py index 5f7c763..27792b3 --- a/pandas_schema/validation.py +++ b/pandas_schema/validations.py @@ -7,152 +7,21 @@ import operator from . 
import column +from .core import SeriesValidation, IndexValidation, IsEmptyValidation +from .index import DualAxisIndexer, IndexValue from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype +from pandas_schema.scope import ValidationScope -class _BaseValidation: - """ - The validation base class that defines any object that can create a list of errors from a Series - """ - __metaclass__ = abc.ABCMeta - - @abc.abstractmethod - def get_errors(self, series: pd.Series, column: 'column.Column') -> typing.Iterable[ValidationWarning]: - """ - Return a list of errors in the given series - :param series: - :param column: - :return: - """ - - -class _SeriesValidation(_BaseValidation): - """ - Implements the _BaseValidation interface by returning a Boolean series for each element that either passes or - fails the validation - """ - __metaclass__ = abc.ABCMeta - - def __init__(self, **kwargs): - self._custom_message = kwargs.get('message') - - @property - def message(self): - return self._custom_message or self.default_message - - @abc.abstractproperty - def default_message(self) -> str: - """ - Create a message to be displayed whenever this validation fails - This should be a generic message for the validation type, but can be overwritten if the user provides a - message kwarg - """ - - @abc.abstractmethod - def validate(self, series: pd.Series) -> pd.Series: - """ - Returns a Boolean series, where each value of False is an element in the Series that has failed the validation - :param series: - :return: - """ - - def __invert__(self): - """ - Returns a negated version of this validation - """ - return _InverseValidation(self) - - def __or__(self, other: '_SeriesValidation'): - """ - Returns a validation which is true if either this or the other validation is true - """ - return _CombinedValidation(self, other, operator.or_) - - def __and__(self, other: '_SeriesValidation'): - 
""" - Returns a validation which is true if either this or the other validation is true - """ - return _CombinedValidation(self, other, operator.and_) - - def get_errors(self, series: pd.Series, column: 'column.Column'): - - errors = [] - - # Calculate which columns are valid using the child class's validate function, skipping empty entries if the - # column specifies to do so - simple_validation = ~self.validate(series) - if column.allow_empty: - # Failing results are those that are not empty, and fail the validation - # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is - if is_categorical_dtype(series) or is_numeric_dtype(series): - validated = ~series.isnull() & simple_validation - else: - validated = (series.str.len() > 0) & simple_validation - - else: - validated = simple_validation - - # Cut down the original series to only ones that failed the validation - indices = series.index[validated] - - # Use these indices to find the failing items. 
Also print the index which is probably a row number - for i in indices: - element = series[i] - errors.append(ValidationWarning( - message=self.message, - value=element, - row=i, - column=series.name - )) - - return errors - - -class _InverseValidation(_SeriesValidation): - """ - Negates an ElementValidation - """ - - def __init__(self, validation: _SeriesValidation): - self.negated = validation - super().__init__() - - def validate(self, series: pd.Series): - return ~ self.negated.validate(series) - - @property - def default_message(self): - return self.negated.message + ' ' - - -class _CombinedValidation(_SeriesValidation): - """ - Validates if one and/or the other validation is true for an element - """ - - def __init__(self, validation_a: _SeriesValidation, validation_b: _SeriesValidation, operator): - self.operator = operator - self.v_a = validation_a - self.v_b = validation_b - super().__init__() - - def validate(self, series: pd.Series): - return self.operator(self.v_a.validate(series), self.v_b.validate(series)) - - @property - def default_message(self): - return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) - - -class CustomSeriesValidation(_SeriesValidation): +class CustomSeriesValidation(SeriesValidation): """ Validates using a user-provided function that operates on an entire series (for example by using one of the pandas Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) """ - def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): + def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], *args, **kwargs): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -162,19 +31,19 @@ def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal to True if the object passed validation, and False if it failed """ + super().__init__(*args, **kwargs) self._validation = validation - super().__init__(message=message) - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return self._validation(series) -class CustomElementValidation(_SeriesValidation): +class CustomElementValidation(SeriesValidation): """ Validates using a user-provided function that operates on each element """ - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], *args, **kwargs): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -185,13 +54,13 @@ def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], messag the validation, and false if it doesn't """ self._validation = validation - super().__init__(message=message) + super().__init__(*args, **kwargs) - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.apply(self._validation) -class InRangeValidation(_SeriesValidation): +class InRangeValidation(SeriesValidation): """ Checks that each element in the series is within a given numerical range """ @@ -205,16 +74,15 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'was not in the range [{}, {})'.format(self.min, self.max) - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: series = pd.to_numeric(series, errors="coerce") return (series >= self.min) & (series < self.max) -class IsDtypeValidation(_BaseValidation): +class IsDtypeValidation(SeriesValidation, scope=ValidationScope.SERIES): """ Checks that a series has a certain numpy dtype """ @@ -223,21 +91,34 @@ def __init__(self, dtype: np.dtype, **kwargs): """ :param dtype: The numpy dtype to check the column against """ - self.dtype = dtype super().__init__(**kwargs) + self.dtype = dtype + + def default_message(self, warning: ValidationWarning) -> str: + return 'has a dtype of {} which is not a subclass of the required type {}'.format( + self.dtype, warning.props['dtype']) - def get_errors(self, series: pd.Series, column: 'column.Column' = None): - if not np.issubdtype(series.dtype, self.dtype): - return [ValidationWarning( - 'The column {} has a dtype of {} which is not a subclass of 
the required type {}'.format( - column.name if column else '', series.dtype, self.dtype - ) - )] + def make_series_warning(self, df: pd.DataFrame, column: str, series: pd.Series) -> ValidationWarning: + return ValidationWarning( + self, + column=column, + dtype=series.dtype + ) + + def validate_series(self, series: pd.Series): + if np.issubdtype(series.dtype, self.dtype): + return True else: - return [] + return False + # return [ValidationWarning( + # self, + # {'dtype': series.dtype} + # )] + # else: + # return [] -class CanCallValidation(_SeriesValidation): +class CanCallValidation(SeriesValidation): """ Validates if a given function can be called on each element in a column without raising an exception """ @@ -250,12 +131,14 @@ def __init__(self, func: typing.Callable, **kwargs): if callable(type): self.callable = func else: - raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) + raise PanSchArgumentError( + 'The object "{}" passed to CanCallValidation is not callable!'.format( + type)) super().__init__(**kwargs) - @property - def default_message(self): - return 'raised an exception when the callable {} was called on it'.format(self.callable) + def default_message(self, warning: ValidationWarning): + return 'raised an exception when the callable {} was called on it'.format( + self.callable) def can_call(self, var): try: @@ -264,7 +147,7 @@ def can_call(self, var): except: return False - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.apply(self.can_call) @@ -288,12 +171,11 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'cannot be converted to type {}'.format(self.callable) -class MatchesPatternValidation(_SeriesValidation): +class 
MatchesPatternValidation(SeriesValidation): """ Validates that a string or regular expression can match somewhere in each element in this column """ @@ -308,15 +190,14 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - @property - def default_message(self): - return 'does not match the pattern "{}"'.format(self.pattern) + def default_message(self, warning: ValidationWarning): + return 'does not match the pattern "{}"'.format(self.pattern.pattern) - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.astype(str).str.contains(self.pattern, **self.options) -class TrailingWhitespaceValidation(_SeriesValidation): +class TrailingWhitespaceValidation(SeriesValidation): """ Checks that there is no trailing whitespace in this column """ @@ -324,15 +205,14 @@ class TrailingWhitespaceValidation(_SeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains trailing whitespace' - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return ~series.astype(str).str.contains('\s+$') -class LeadingWhitespaceValidation(_SeriesValidation): +class LeadingWhitespaceValidation(SeriesValidation): """ Checks that there is no leading whitespace in this column """ @@ -340,15 +220,14 @@ class LeadingWhitespaceValidation(_SeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains leading whitespace' - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return ~series.astype(str).str.contains('^\s+') -class IsDistinctValidation(_SeriesValidation): +class 
IsDistinctValidation(SeriesValidation): """ Checks that every element of this column is different from each other element """ @@ -356,15 +235,14 @@ class IsDistinctValidation(_SeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains values that are not unique' - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return ~series.duplicated(keep='first') -class InListValidation(_SeriesValidation): +class InListValidation(SeriesValidation): """ Checks that each element in this column is contained within a list of possibilities """ @@ -378,19 +256,18 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) - return 'is not in the list of legal options ({})'.format(values) + return 'was not in the list of legal options [{}]'.format(values) - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: if self.case_sensitive: return series.isin(self.options) else: return series.str.lower().isin([s.lower() for s in self.options]) -class DateFormatValidation(_SeriesValidation): +class DateFormatValidation(SeriesValidation): """ Checks that each element in this column is a valid date according to a provided format string """ @@ -404,8 +281,7 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): @@ -415,5 +291,7 @@ def valid_date(self, val): except: return 
False - def validate(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.astype(str).apply(self.valid_date) + + diff --git a/pandas_schema/version.py b/pandas_schema/version.py old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py index 316cb58..e78c8c3 100755 --- a/setup.py +++ b/setup.py @@ -82,7 +82,14 @@ def run(self): ], keywords='pandas csv verification schema', packages=find_packages(include=['pandas_schema']), - install_requires=['numpy', 'pandas>=0.19'], + install_requires=[ + 'numpy', + 'pandas>=0.23', + 'dataclasses' + ], + extras_requires={ + 'dev': ['pytest'], + }, cmdclass={ 'build_readme': BuildReadme, 'build_site': BuildHtmlDocs diff --git a/test/__init__.py b/test/__init__.py old mode 100644 new mode 100755 index 27a4caa..d59a7f6 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,5 +1,5 @@ from test.test_example import * -from test.test_column import * +# from test.test_column import * from test.test_schema import * from test.test_validation import * from test.test_validation_warning import * diff --git a/test/test_column.py b/test/test_column.py deleted file mode 100644 index 38e61f0..0000000 --- a/test/test_column.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest -import pandas as pd - -from pandas_schema import Column -from pandas_schema.validation import CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation - - -class SingleValidationColumn(unittest.TestCase): - """ - Test a column with one single validation - """ - NAME = 'col1' - - col = Column(NAME, [CanConvertValidation(int)], allow_empty=False) - ser = pd.Series([ - 'a', - 'b', - 'c' - ]) - - def test_name(self): - self.assertEqual(self.col.name, self.NAME, 'A Column does not store its name correctly') - - def test_outputs(self): - results = self.col.validate(self.ser) - - self.assertEqual(len(results), 
len(self.ser), 'A Column produces the wrong number of errors') - for i in range(2): - self.assertTrue(any([r.row == i for r in results]), 'A Column does not report errors for every row') - - -class DoubleValidationColumn(unittest.TestCase): - """ - Test a column with two different validations - """ - NAME = 'col1' - - col = Column(NAME, [TrailingWhitespaceValidation(), LeadingWhitespaceValidation()], allow_empty=False) - ser = pd.Series([ - ' a ', - ' b ', - ' c ' - ]) - - def test_outputs(self): - results = self.col.validate(self.ser) - - # There should be 6 errors, 2 for each row - self.assertEqual(len(results), 2 * len(self.ser), 'A Column produces the wrong number of errors') - for i in range(2): - in_row = [r for r in results if r.row == i] - self.assertEqual(len(in_row), 2, 'A Column does not report both errors for every row') - - -class AllowEmptyColumn(unittest.TestCase): - """ - Test a column with one single validation that allows empty columns - """ - NAME = 'col1' - - col = Column(NAME, [CanConvertValidation(int)], allow_empty=True) - ser = pd.Series([ - '', - ]) - - def test_outputs(self): - results = self.col.validate(self.ser) - self.assertEqual(len(results), 0, 'allow_empty is not allowing empty columns') diff --git a/test/test_combined_validation.py b/test/test_combined_validation.py new file mode 100644 index 0000000..4c20f66 --- /dev/null +++ b/test/test_combined_validation.py @@ -0,0 +1,145 @@ +import json +import unittest +import re +import math + +from numpy import nan, dtype +import numpy as np +import pandas as pd + +from pandas_schema.validations import * +from pandas_schema.core import CombinedValidation, BaseValidation +from pandas_schema.index import ColumnIndexer as ci +from pandas_schema.schema import Schema +from pandas_schema.column import column, column_sequence +from pandas_schema import ValidationWarning + +from .util import get_warnings + + +class Or(unittest.TestCase): + """ + Tests the | operator on two MatchesPatternValidations 
+ """ + + def setUp(self): + self.validator = MatchesPatternValidation( + 'yes', index=0 + ) | MatchesPatternValidation( + 'pass', index=0 + ) + + def test_valid_items(self): + warnings = get_warnings(self.validator, [ + 'pass', + 'yes', + 'passyes', + '345yes345' + ]) + assert len(warnings) == 0, 'rejects values that should pass' + + def test_invalid_items(self): + warnings = get_warnings(self.validator, [ + 'fail', + 'YES', + 'YPESS' + ]) + + assert len(warnings) == 3, 'accepts values that should pass' + + +class NumericAndOr(unittest.TestCase): + """ + Tests a more complex case where we have an "or" and then an "and". This schema allows either numbers + represented as either digits or words + """ + validator = InListValidation(['one', 'two', 'three'], index=0) | ( + IsDtypeValidation(np.int_, index=0) & InRangeValidation(1, 4, index=0) + ) + + def test_passing_words(self): + warnings = get_warnings(self.validator, [ + 'one', + 'two', + 'three' + ]) + assert len(warnings) == 0 + + def test_failing_words(self): + warnings = get_warnings(self.validator, [ + 'four', + 'five', + 'six' + ]) + assert len(warnings) == 3 + + def test_passing_numbers(self): + warnings = get_warnings(self.validator, [ + 1, + 2, + 3 + ]) + assert len(warnings) == 0 + + def test_failing_numbers(self): + warnings = get_warnings(self.validator, pd.Series([ + 4, + 5, + 6 + ], dtype=np.int_)) + assert len(warnings) == 3 + for warning in warnings: + print(warning.message) + + +class DateAndOr(unittest.TestCase): + """ + Allows days of the week as either numbers or short words, or long words + """ + # Note: this isn't an actually well-designed validation; the two InLists should really be one validation. 
+ # But here we're testing a somewhat complex validation + validator = column(( + CanConvertValidation(int) & InRangeValidation(min=1, max=8) + ) | ( + CanConvertValidation(str) & InListValidation(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']) + ) | ( + CanConvertValidation(str) & InListValidation([ + 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' + ]) + ), index=0) + + def test_correct(self): + warnings = get_warnings(self.validator, ['Mon', 3, 'Thursday', 1, 'Fri', 6, 7]) + assert len(warnings) == 0, warnings + + def test_incorrect(self): + warnings = get_warnings(self.validator, [0, 8, 'Mondesday', 'Frisday', 'Sund', 'Frid']) + assert len(warnings) == 6, warnings + for warning in warnings: + assert 'CombinedValidation' not in warning.message + +class Optional(unittest.TestCase): + """ + Tests the "optional" method, which Ors the validation with an IsEmptyValidation + """ + validator = InRangeValidation(5, 10, index=0).optional() + + def test_passing(self): + warnings = get_warnings(self.validator, [ + 5, + None, + 6, + None, + 7, + None + ]) + + assert warnings == [], 'is not accepting null values' + + def test_failing(self): + assert len(get_warnings(self.validator, [ + 0, + math.inf, + -1, + 10 + ])) == 4, 'is accepting invalid values' diff --git a/test/test_df_validations.py b/test/test_df_validations.py new file mode 100644 index 0000000..67db474 --- /dev/null +++ b/test/test_df_validations.py @@ -0,0 +1,62 @@ +import pandas as pd +import pytest + +from pandas_schema import ValidationWarning +from pandas_schema.df_validations import DistinctRowValidation +from pandas.testing import assert_series_equal + + +@pytest.mark.parametrize(['df', 'result', 'kwargs'], [ + [ + # By default, all duplicates should be marked + pd.DataFrame([ + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ]), + [ + False, False, False, False + ], + dict() + ], + [ + # With keep='first', the first duplicates are okay + 
pd.DataFrame([ + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ]), + [ + True, True, False, False + ], + dict(keep='first') + ], + [ + # With keep='last', the last duplicates are okay + pd.DataFrame([ + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ]), + [ + False, False, True, True + ], + dict(keep='last') + ] +]) +def test_distinct_row_validation(df, result, kwargs): + validator = DistinctRowValidation(**kwargs) + + # Test the internal validation that produces a Series + series = validator.validate_df(df) + assert_series_equal(series, pd.Series(result)) + + # Test the public method that returns warnings + # The number of warnings should be equal to the number of failures + warnings = validator.validate(df) + assert len(warnings) == result.count(False) + assert isinstance(warnings[0], ValidationWarning) + diff --git a/test/test_example.py b/test/test_example.py old mode 100644 new mode 100755 diff --git a/test/test_metadata.py b/test/test_metadata.py old mode 100644 new mode 100755 diff --git a/test/test_schema.py b/test/test_schema.py old mode 100644 new mode 100755 index ae7b337..4461c62 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -3,16 +3,16 @@ import pandas as pd from numpy.core.multiarray import dtype -from pandas_schema import Schema, Column -from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation +from pandas_schema.schema import Schema +from pandas_schema.validations import LeadingWhitespaceValidation, IsDtypeValidation from pandas_schema.errors import PanSchArgumentError class UnorderedSchema(unittest.TestCase): - schema = Schema([ - Column('a'), - Column('b', [LeadingWhitespaceValidation()]) - ], ordered=False) + # schema = Schema([ + # Column('a'), + # Column('b', [LeadingWhitespaceValidation()]) + # ], ordered=False) def test_fields(self): self.assertEqual(len(self.schema.columns), 2, 'The schema is not storing all of its columns') @@ -138,10 +138,10 @@ 
def test_column_subset_error(self): class OrderedSchema(unittest.TestCase): - schema = Schema([ - Column('a', [LeadingWhitespaceValidation()]), - Column('b') - ], ordered=True) + # schema = Schema([ + # Column('a', [LeadingWhitespaceValidation()]), + # Column('b') + # ], ordered=True) def test_mixed_columns(self): """ diff --git a/test/test_validation.py b/test/test_validation.py old mode 100644 new mode 100755 index fc40100..a496b63 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -1,44 +1,29 @@ +""" +Tests for pandas_schema.validations +""" import json import unittest import re from numpy import nan, dtype - -from pandas_schema import Column, Schema -from pandas_schema.validation import _BaseValidation -from pandas_schema.validation import * +import numpy as np +import pandas as pd + +from pandas_schema.validations import * +from pandas_schema.core import CombinedValidation, BaseValidation +from pandas_schema.index import DirectColumnIndexer as ci +from pandas_schema.schema import Schema +from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning +from .util import get_warnings + class ValidationTestBase(unittest.TestCase): def seriesEquality(self, s1: pd.Series, s2: pd.Series, msg: str = None): if not s1.equals(s2): raise self.failureException(msg) - def validate_and_compare(self, series: list, expected_result: bool, msg: str = None, series_dtype: object = None): - """ - Checks that every element in the provided series is equal to `expected_result` after validation - :param series_dtype: Explicity specifies the dtype for the generated Series - :param series: The series to check - :param expected_result: Whether the elements in this series should pass the validation - :param msg: The message to display if this test fails - """ - - # Check that self.validator is correct - if not self.validator or not isinstance(self.validator, _BaseValidation): - raise ValueError('The class must have the validator field set 
to an instance of a Validation subclass') - - # Ensure we're comparing series correctly - self.addTypeEqualityFunc(pd.Series, self.seriesEquality) - - # Convert the input list to a series and validate it - results = self.validator.validate(pd.Series(series, dtype=series_dtype)) - - # Now find any items where their validation does not correspond to the expected_result - for item, result in zip(series, results): - with self.subTest(value=item): - self.assertEqual(result, expected_result, msg) - class CustomSeries(ValidationTestBase): """ @@ -46,13 +31,19 @@ class CustomSeries(ValidationTestBase): """ def setUp(self): - self.validator = CustomSeriesValidation(lambda s: ~s.str.contains('fail'), 'contained the word fail') + self.validator = CustomSeriesValidation( + lambda s: ~s.str.contains('fail'), + message='contained the word fail', + index=0 + ) def test_valid_inputs(self): - self.validate_and_compare(['good', 'success'], True, 'did not accept valid inputs') + assert len(get_warnings(self.validator, ['good', + 'success'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - self.validate_and_compare(['fail', 'failure'], False, 'accepted invalid inputs') + assert len(get_warnings(self.validator, + ['fail', 'failure'])) == 2, 'accepted invalid inputs' class CustomElement(ValidationTestBase): @@ -61,13 +52,21 @@ class CustomElement(ValidationTestBase): """ def setUp(self): - self.validator = CustomElementValidation(lambda s: s.startswith('_start_'), "Didn't begin with '_start_'") + self.validator = CustomElementValidation( + lambda s: s.startswith('_start_'), + message="Didn't begin with '_start_'", + index=0 + ) def test_valid_inputs(self): - self.validate_and_compare(['_start_sdiyhsd', '_start_234fpwunxc\n'], True, 'did not accept valid inputs') + assert len( + get_warnings(self.validator, ['_start_sdiyhsd', '_start_234fpwunxc\n']) + ) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - self.validate_and_compare(['fail', 
'324wfp9ni'], False, 'accepted invalid inputs') + assert len( + get_warnings(self.validator, ['fail', '324wfp9ni']) + ) == 2, 'accepted invalid inputs' class LeadingWhitespace(ValidationTestBase): @@ -76,43 +75,31 @@ class LeadingWhitespace(ValidationTestBase): """ def setUp(self): - self.validator = LeadingWhitespaceValidation() + self.validator = LeadingWhitespaceValidation(index=0) def test_validate_trailing_whitespace(self): - self.validate_and_compare( - [ - 'trailing space ', - 'trailing tabs ', - '''trailing newline - ''' - ], - True, - 'is incorrectly failing on trailing whitespace' - ) + assert len(get_warnings(self.validator, [ + 'trailing space ', + 'trailing tabs ', + '''trailing newline + ''' + ])) == 0, 'is incorrectly failing on trailing whitespace' def test_validate_leading_whitespace(self): - self.validate_and_compare( - [ - ' leading spaces', - ' leading tabs', - ''' - leading newline''', - ], - False, - 'does not detect leading whitespace' - ) + assert len(get_warnings(self.validator, [ + ' leading spaces', + ' leading tabs', + ''' + leading newline''', + ])) == 3, 'does not detect leading whitespace' def test_validate_middle_whitespace(self): - self.validate_and_compare( - [ - 'middle spaces', - 'middle tabs', - '''middle - newline''', - ], - True, - 'is incorrectly failing on central whitespace' - ) + assert len(get_warnings(self.validator, [ + 'middle spaces', + 'middle tabs', + '''middle + newline''', + ])) == 0, 'is incorrectly failing on central whitespace' class TrailingWhitespace(ValidationTestBase): @@ -121,44 +108,32 @@ class TrailingWhitespace(ValidationTestBase): """ def setUp(self): - self.validator = TrailingWhitespaceValidation() + self.validator = TrailingWhitespaceValidation(index=0) super().setUp() def test_validate_trailing_whitespace(self): - self.validate_and_compare( - [ - 'trailing space ', - 'trailing tabs ', - '''trailing newline - ''' - ], - False, - 'is not detecting trailing whitespace' - ) + assert 
len(get_warnings(self.validator, [ + 'trailing space ', + 'trailing tabs ', + '''trailing newline + ''' + ])) == 3, 'is not detecting trailing whitespace' def test_validate_leading_whitespace(self): - self.validate_and_compare( - [ - ' leading spaces', - ' leading tabs', - ''' - leading newline''', - ], - True, - 'is incorrectly failing on leading whitespace' - ) + assert len(get_warnings(self.validator, [ + ' leading spaces', + ' leading tabs', + ''' + leading newline''', + ])) == 0, 'is incorrectly failing on leading whitespace' def test_validate_middle_whitespace(self): - self.validate_and_compare( - [ - 'middle spaces', - 'middle tabs', - '''middle - newline''', - ], - True, - 'is incorrectly failing on central whitespace' - ) + assert len(get_warnings(self.validator, [ + 'middle spaces', + 'middle tabs', + '''middle + newline''', + ])) == 0, 'is incorrectly failing on central whitespace' class CanCallJson(ValidationTestBase): @@ -167,29 +142,21 @@ class CanCallJson(ValidationTestBase): """ def setUp(self): - self.validator = CanCallValidation(json.loads) + self.validator = CanCallValidation(json.loads, index=0) def test_validate_valid_json(self): - self.validate_and_compare( - [ - '[1, 2, 3]', - '{"a": 1.1, "b": 2.2, "c": 3.3}', - '"string"' - ], - True, - 'is incorrectly failing on valid JSON' - ) + assert len(get_warnings(self.validator, [ + '[1, 2, 3]', + '{"a": 1.1, "b": 2.2, "c": 3.3}', + '"string"' + ])) == 0, 'is incorrectly failing on valid JSON' def test_validate_invalid_json(self): - self.validate_and_compare( - [ - '[1, 2, 3', - '{a: 1.1, b: 2.2, c: 3.3}', - 'string' - ], - False, - 'is not detecting invalid JSON' - ) + assert len(get_warnings(self.validator, [ + '[1, 2, 3', + '{a: 1.1, b: 2.2, c: 3.3}', + 'string' + ])) == 3, 'is not detecting invalid JSON' class CanCallLambda(ValidationTestBase): @@ -199,29 +166,22 @@ class CanCallLambda(ValidationTestBase): def setUp(self): # Succeed if it's divisible by 2, otherwise cause an error - 
self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0) + self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, + index=0) def test_validate_noerror(self): - self.validate_and_compare( - [ - 2, - 4, - 6 - ], - True, - 'is incorrectly failing on even numbers' - ) + assert len(get_warnings(self.validator, [ + 2, + 4, + 6 + ])) == 0, 'is incorrectly failing on even numbers' def test_validate_error(self): - self.validate_and_compare( - [ - 1, - 3, - 5 - ], - False, - 'should fail on odd numbers' - ) + assert len(get_warnings(self.validator, [ + 1, + 3, + 5 + ])) == 3, 'should fail on odd numbers' class CanConvertInt(ValidationTestBase): @@ -230,176 +190,129 @@ class CanConvertInt(ValidationTestBase): """ def setUp(self): - self.validator = CanConvertValidation(int) + self.validator = CanConvertValidation(int, index=0) def test_valid_int(self): - self.validate_and_compare( - [ - '1', - '10', - '999', - '99999' - ], - True, - 'does not accept valid integers' - ) + assert len(get_warnings(self.validator, [ + '1', + '10', + '999', + '99999' + ])) == 0, 'does not accept valid integers' def test_invalid_int(self): - self.validate_and_compare( - [ - '1.0', - '9.5', - 'abc', - '1e-6' - ], - False, - 'accepts invalid integers' - ) + assert len(get_warnings(self.validator, [ + '1.0', + '9.5', + 'abc', + '1e-6' + ])) == 4, 'accepts invalid integers' class InListCaseSensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c']) + self.validator = InListValidation(['a', 'b', 'c'], index=0) def test_valid_elements(self): - self.validate_and_compare( - [ - 'a', - 'b', - 'c' - ], - True, - 'does not accept elements that are in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'a', + 'b', + 'c' + ])) == 0, 'does not accept elements that are in the validation list' def test_invalid_elements(self): - self.validate_and_compare( - [ - 'aa', - 'bb', - 'd', - 'A', - 'B', - 'C' - ], - False, 
- 'accepts elements that are not in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'aa', + 'bb', + 'd', + 'A', + 'B', + 'C' + ])) == 6, 'accepts elements that are not in the validation list' class InListCaseInsensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, + index=0) def test_valid_elements(self): - self.validate_and_compare( - [ - 'a', - 'b', - 'c', - 'A', - 'B', - 'C' - ], - True, - 'does not accept elements that are in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'a', + 'b', + 'c', + 'A', + 'B', + 'C' + ])) == 0, 'does not accept elements that are in the validation list' def test_invalid_elements(self): - self.validate_and_compare( - [ - 'aa', - 'bb', - 'd', - ], - False, - 'accepts elements that are not in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'aa', + 'bb', + 'd', + ])) == 3, 'accepts elements that are not in the validation list' class DateFormat(ValidationTestBase): def setUp(self): - self.validator = DateFormatValidation('%Y%m%d') + self.validator = DateFormatValidation('%Y%m%d', index=0) def test_valid_dates(self): - self.validate_and_compare( - [ - '20160404', - '00011212' - ], - True, - 'does not accept valid dates' - ) + assert len(get_warnings(self.validator, [ + '20160404', + '00011212' + ])) == 0, 'does not accept valid dates' def test_invalid_dates(self): - self.validate_and_compare( - [ - '1/2/3456', - 'yyyymmdd', - '11112233' - ], - False, - 'accepts invalid dates' - ) + assert len(get_warnings(self.validator, [ + '1/2/3456', + 'yyyymmdd', + '11112233' + ])) == 3, 'accepts invalid dates' class StringRegexMatch(ValidationTestBase): def setUp(self): - self.validator = MatchesPatternValidation('^.+\.txt$') + self.validator = MatchesPatternValidation(r'^.+\.txt$', index=0) def test_valid_strings(self): - 
self.validate_and_compare( - [ - 'pass.txt', - 'a.txt', - 'lots of words.txt' - ], - True, - 'does not accept strings matching the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txt', + 'a.txt', + 'lots of words.txt' + ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - self.validate_and_compare( - [ - 'pass.TXT', - '.txt', - 'lots of words.tx' - ], - False, - 'accepts strings that do not match the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.TXT', + '.txt', + 'lots of words.tx' + ])) == 3, 'accepts strings that do not match the regex' class IsDistinct(ValidationTestBase): def setUp(self): - self.validator = IsDistinctValidation() + self.validator = IsDistinctValidation(index=0) def test_valid_strings(self): - self.validate_and_compare( - [ - '1', - '2', - '3', - '4' - ], - True, - 'does not accept unique strings' - ) + assert len(get_warnings(self.validator, [ + '1', + '2', + '3', + '4' + ])) == 0, 'does not accept unique strings' def test_invalid_strings(self): - validation = self.validator.validate(pd.Series([ + warnings = get_warnings(self.validator, [ '1', '1', '3', '4' - ])) + ]) - self.assertTrue((validation == pd.Series([ - True, - False, - True, - True - ])).all(), 'did not identify the error') + assert len(warnings) == 1 + assert warnings[0].props['row'] == 1, 'did not identify the error' class CompiledRegexMatch(ValidationTestBase): @@ -408,29 +321,33 @@ class CompiledRegexMatch(ValidationTestBase): """ def setUp(self): - self.validator = MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE)) + self.validator = MatchesPatternValidation( + re.compile('^.+\.txt$', re.IGNORECASE), index=0) def test_valid_strings(self): - self.validate_and_compare( - [ - 'pass.txt', - 'a.TXT', - 'lots of words.tXt' - ], - True, - 'does not accept strings matching the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txt', + 'a.TXT', + 'lots of words.tXt' + ])) == 0, 'does not accept 
strings matching the regex' def test_invalid_strings(self): - self.validate_and_compare( - [ - 'pass.txtt', - '.txt', - 'lots of words.tx' - ], - False, - 'accepts strings that do not match the regex' - ) + test_data = [ + 'pass.txtt', + '.txt', + 'lots of words.tx' + ] + warnings = get_warnings(self.validator, test_data) + + # Check that every piece of data failed + assert len(warnings) == 3, 'accepts strings that do not match the regex' + + # Also test the messages + for i, (warning, data) in enumerate(zip(warnings, test_data)): + assert 'Row {}'.format(i) in warning.message + assert 'Column 0' in warning.message + assert data in warning.message + assert self.validator.pattern.pattern in warning.message class InRange(ValidationTestBase): @@ -439,50 +356,34 @@ class InRange(ValidationTestBase): """ def setUp(self): - self.validator = InRangeValidation(7, 9) + self.validator = InRangeValidation(7, 9, index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 7, - 8, - 7 - ], - True, - 'does not accept integers in the correct range' - ) + assert len(get_warnings(self.validator, [ + 7, + 8, + 7 + ])) == 0, 'does not accept integers in the correct range' def test_invalid_items(self): - self.validate_and_compare( - [ - 1, - 2, - 3 - ], - False, - 'Incorrectly accepts integers outside of the range' - ) + assert len(get_warnings(self.validator, [ + 1, + 2, + 3 + ])) == 3, 'Incorrectly accepts integers outside of the range' def test_valid_character_items(self): - self.validate_and_compare( - [ - 7, - "8", - 8 - ], - True, - "Does not accept integers provided as a string" - ) + assert len(get_warnings(self.validator, [ + 7, + "8", + 8 + ])) == 0, "Does not accept integers provided as a string" def test_invalid_character_items(self): - self.validate_and_compare( - [ - "seven", - "eight", - ], - False, - "Incorrectly accepts items with non numerical text" - ) + assert len(get_warnings(self.validator, [ + "seven", + "eight", + ])) == 2, "Incorrectly accepts items 
with non numerical text" class Dtype(ValidationTestBase): @@ -491,25 +392,23 @@ class Dtype(ValidationTestBase): """ def setUp(self): - self.validator = IsDtypeValidation(np.number) + self.validator = IsDtypeValidation(np.number, index=0) def test_valid_items(self): - errors = self.validator.get_errors(pd.Series( - [ - 1, - 2, - 3 - ])) + errors = get_warnings(self.validator, pd.Series([ + 1, + 2, + 3 + ], dtype=np.int_)) self.assertEqual(len(errors), 0) def test_invalid_items(self): - errors = self.validator.get_errors(pd.Series( - [ - 'a', - '', - 'c' - ])) + errors = get_warnings(self.validator, [ + 'a', + '', + 'c' + ]) self.assertEqual(len(errors), 1) self.assertEqual(type(errors[0]), ValidationWarning) @@ -526,84 +425,84 @@ def test_schema(self): }) schema = Schema([ - Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), - Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), - Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), + IsDtypeValidation(dtype('int64'), index=ci('wrong_dtype1')), + IsDtypeValidation(dtype('float64'), index=ci('wrong_dtype2')), + IsDtypeValidation(dtype('int64'), index=ci('wrong_dtype3')), ]) errors = schema.validate(df) - self.assertEqual( - sorted([str(x) for x in errors]), - sorted([ - 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', - 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', - 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' - ]) - ) + assert len(errors) == 3 + for error, correct_dtype in zip(errors, [np.object, np.int64, np.float64]): + assert error.props['dtype'] == correct_dtype -class Negate(ValidationTestBase): - """ - Tests the ~ operator on a MatchesPatternValidation - """ +class IsEmpty(ValidationTestBase): def setUp(self): - self.validator = ~MatchesPatternValidation('fail') + self.validator = IsEmptyValidation(index=0) - def 
test_valid_items(self): - self.validate_and_compare( - [ - 'Pass', - '1', - 'True' - ], - True, - 'Rejects values that should pass' - ) + def test_valid_items_float(self): + errors = get_warnings(self.validator, pd.Series([ + np.nan, + np.nan + ], dtype=np.float_)) - def test_invalid_items(self): - self.validate_and_compare( - [ - 'fail', - 'thisfails', - 'failure' - ], - False, - 'Accepts values that should pass' - ) + self.assertEqual(len(errors), 0) + + def test_valid_items_str(self): + errors = get_warnings(self.validator, pd.Series([ + '', + '', + '' + ], dtype=np.str_)) + + self.assertEqual(len(errors), 0) + + def test_invalid_items_int(self): + errors = get_warnings(self.validator, pd.Series([ + 0, + 1, + -1 + ], dtype=np.int_)) + + self.assertEqual(len(errors), 3) + self.assertEqual(type(errors[0]), ValidationWarning) + + + def test_invalid_items_str(self): + errors = get_warnings(self.validator, pd.Series([ + 'a', + ' ' + ], dtype=np.str_)) + + self.assertEqual(len(errors), 2) + self.assertEqual(type(errors[0]), ValidationWarning) -class Or(ValidationTestBase): +class Negate(ValidationTestBase): """ - Tests the | operator on two MatchesPatternValidations + Tests the ~ operator on a MatchesPatternValidation """ def setUp(self): - self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass') + self.validator = ~MatchesPatternValidation('fail', index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 'pass', - 'yes', - 'passyes', - '345yes345' - ], - True, - 'Rejects values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'Pass', + '1', + 'True' + ])) == 0, 'Rejects values that should pass' def test_invalid_items(self): - self.validate_and_compare( - [ - 'fail', - 'YES', - 'YPESS' - ], - False, - 'Accepts values that should pass' - ) + warnings = get_warnings(self.validator, [ + 'fail', + 'thisfails', + 'failure' + ]) + + assert len(warnings) == 3, 'Accepts values that should pass' class 
CustomMessage(ValidationTestBase): @@ -615,28 +514,27 @@ def setUp(self): self.message = "UNUSUAL MESSAGE THAT WOULDN'T BE IN A NORMAL ERROR" def test_default_message(self): - validator = InRangeValidation(min=4) - for error in validator.get_errors(pd.Series( - [ - 1, - 2, - 3 - ] - ), Column('')): - self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') + validator = InRangeValidation(min=4, index=0) + for error in get_warnings(validator, [ + 1, + 2, + 3 + ]): + self.assertNotRegex(error.message, self.message, + 'Validator not using the default warning message!') def test_custom_message(self): - validator = InRangeValidation(min=4, message=self.message) - for error in validator.get_errors(pd.Series( - [ - 1, - 2, - 3 - ] - ), Column('')): - self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') + validator = InRangeValidation(min=4, message=self.message, index=0) + for error in get_warnings(validator, [ + 1, + 2, + 3 + ]): + self.assertRegex(error.message, self.message, + 'Validator not using the custom warning message!') +@unittest.skip('allow_empty no longer exists') class GetErrorTests(ValidationTestBase): """ Tests for float valued columns where allow_empty=True @@ -646,18 +544,18 @@ def setUp(self): self.vals = [1.0, None, 3] def test_in_range_allow_empty_with_error(self): - validator = InRangeValidation(min=4) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=True)) + validator = InRangeValidation(min=4, index=0) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): - validator = InRangeValidation(min=0) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=True)) + validator = InRangeValidation(min=0, index=0) + errors = list(validator.validate_series(pd.Series(self.vals))) 
self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): - validator = InRangeValidation(min=4) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=False)) + validator = InRangeValidation(min=4, index=0) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), len(self.vals)) @@ -667,24 +565,33 @@ class PandasDtypeTests(ValidationTestBase): """ def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, + index=0) def test_valid_elements(self): - errors = self.validator.get_errors(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), - Column('', allow_empty=True)) - self.assertEqual(len(errors), 0) + errors = get_warnings( + self.validator, + pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category') + ) + assert len(list(errors)) == 0 def test_invalid_empty_elements(self): - errors = self.validator.get_errors(pd.Series(['aa', 'bb', 'd', None], dtype='category'), - Column('', allow_empty=False)) - self.assertEqual(len(errors), 4) + errors = get_warnings( + self.validator, + pd.Series(['aa', 'bb', 'd', None], dtype='category') + ) + assert len(list(errors)) == 4 def test_invalid_and_empty_elements(self): - errors = self.validator.get_errors(pd.Series(['a', None], dtype='category'), - Column('', allow_empty=False)) - self.assertEqual(len(errors), 1) + errors = get_warnings( + self.validator, + pd.Series(['a', None], dtype='category') + ) + assert len(list(errors)) == 1 def test_invalid_elements(self): - errors = self.validator.get_errors(pd.Series(['aa', 'bb', 'd'], dtype='category'), - Column('', allow_empty=True)) - self.assertEqual(len(errors), 3) + errors = get_warnings( + self.validator, + pd.Series(['aa', 'bb', 'd'], dtype='category') + ) + assert len(list(errors)) == 3 diff --git a/test/test_validation_warning.py b/test/test_validation_warning.py 
old mode 100644 new mode 100755 diff --git a/test/util.py b/test/util.py new file mode 100644 index 0000000..bab2b77 --- /dev/null +++ b/test/util.py @@ -0,0 +1,32 @@ +import pandas as pd +from pandas_schema.core import BaseValidation +from pandas_schema.validation_warning import ValidationWarning +import typing + + +def get_warnings(validator: BaseValidation, series: typing.Union[list, pd.Series]) -> typing.Collection[ + ValidationWarning]: + """ + Runs a validator against a series and returns the warnings it generates, so that + tests can make assertions about them + :param validator: The validation to run + :param series: The values to check, as a list or Series; converted to a single-column DataFrame before validating + :return: A list of the warnings produced by the validation + """ + + # # Check that self.validator is correct + # if not self.validator or not isinstance(self.validator, BooleanSeriesValidation, index=0): + # raise ValueError('The class must have the validator field set to an instance of a Validation subclass') + # + # # Ensure we're comparing series correctly + # self.addTypeEqualityFunc(pd.Series, self.seriesEquality) + + df = pd.Series(series).to_frame() + warnings = validator.validate(df) + return list(warnings) + # + # # Now find any items where their validation does not correspond to the expected_result + # for item, result in zip(series, results): + # with self.subTest(value=item): + # self.assertEqual(result, expected_result, msg) +