From 2f7e7f3d3295bb0c9e3069b6afe2dc2a37b167a1 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 18 Nov 2019 01:24:46 +1100 Subject: [PATCH] Initial groundwork for the rewrite --- pandas_schema/column.py | 63 ++++-- pandas_schema/errors.py | 6 + pandas_schema/schema.py | 2 +- pandas_schema/validation.py | 411 +++--------------------------------- 4 files changed, 85 insertions(+), 397 deletions(-) diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 199b883..cec4153 100644 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -4,24 +4,51 @@ from . import validation from .validation_warning import ValidationWarning -class Column: - def __init__(self, name: str, validations: typing.Iterable['validation._BaseValidation'] = [], allow_empty=False): - """ - Creates a new Column object - :param name: The column header that defines this column. This must be identical to the header used in the CSV/Data Frame you are validating. - :param validations: An iterable of objects implementing _BaseValidation that will generate ValidationErrors - :param allow_empty: True if an empty column is considered valid. False if we leave that logic up to the Validation - """ - self.name = name - self.validations = list(validations) - self.allow_empty = allow_empty +def _column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: typing.Union[int, str] = None, + position: bool = False +): + """ + A utility method for setting the index data on a set of Validations + :param validations: A list of validations to modify + :param index: The index of the series that these validations will now consider + :param position: If true, these validations use positional indexing. 
+ See :py:class:`pandas_schema.validation.IndexSeriesValidation` + """ + for valid in validations: + valid.index = index + valid.position = position - def validate(self, series: pd.Series) -> typing.List[ValidationWarning]: - """ - Creates a list of validation errors using the Validation objects contained in the Column - :param series: A pandas Series to validate - :return: An iterable of ValidationError instances generated by the validation - """ - return [error for validation in self.validations for error in validation.get_errors(series, self)] +def label_column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: typing.Union[int, str], +): + """ + A utility method for setting the label-based column for each validation + :param validations: A list of validations to modify + :param index: The label of the series that these validations will now consider + """ + return _column( + validations, + index, + position=False + ) + + +def positional_column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: int, +): + """ + A utility method for setting the position-based column for each validation + :param validations: A list of validations to modify + :param index: The index of the series that these validations will now consider + """ + return _column( + validations, + index, + position=True + ) diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py index a9176bf..ab5e73d 100644 --- a/pandas_schema/errors.py +++ b/pandas_schema/errors.py @@ -10,6 +10,12 @@ class PanSchInvalidSchemaError(PanSchError): """ +class PanSchNoIndexError(PanSchInvalidSchemaError): + """ + A validation was provided that has not specified an index + """ + + class PanSchArgumentError(PanSchError): """ An argument passed to a function has an invalid type or value diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 5c0442e..13d8158 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -11,7 +11,7 @@ class Schema: A 
schema that defines the columns required in the target DataFrame """ - def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): + def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): """ :param columns: A list of column objects :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 2a3f2f8..9343d7b 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -8,412 +8,67 @@ from . import column from .validation_warning import ValidationWarning -from .errors import PanSchArgumentError +from .errors import PanSchArgumentError, PanSchNoIndexError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class _BaseValidation: - """ - The validation base class that defines any object that can create a list of errors from a Series - """ - __metaclass__ = abc.ABCMeta - +class _BaseValidation(abc.ABC): @abc.abstractmethod - def get_errors(self, series: pd.Series, column: 'column.Column') -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ - Return a list of errors in the given series - :param series: - :param column: - :return: + Validates a data frame + :param df: Data frame to validate + :return: All validation failures detected by this validation """ class _SeriesValidation(_BaseValidation): """ - Implements the _BaseValidation interface by returning a Boolean series for each element that either passes or - fails the validation + A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation + to it """ - __metaclass__ = abc.ABCMeta - - def __init__(self, **kwargs): - self._custom_message = kwargs.get('message') - - @property - def message(self): - return self._custom_message or self.default_message - - @abc.abstractproperty - def default_message(self) 
-> str: - """ - Create a message to be displayed whenever this validation fails - This should be a generic message for the validation type, but can be overwritten if the user provides a - message kwarg - """ @abc.abstractmethod - def validate(self, series: pd.Series) -> pd.Series: - """ - Returns a Boolean series, where each value of False is an element in the Series that has failed the validation - :param series: - :return: - """ - - def __invert__(self): - """ - Returns a negated version of this validation - """ - return _InverseValidation(self) - - def __or__(self, other: '_SeriesValidation'): - """ - Returns a validation which is true if either this or the other validation is true - """ - return _CombinedValidation(self, other, operator.or_) - - def __and__(self, other: '_SeriesValidation'): + def select_series(self, df: pd.DataFrame) -> pd.Series: """ - Returns a validation which is true if either this or the other validation is true + Selects a series from the DataFrame that will be validated """ - return _CombinedValidation(self, other, operator.and_) - - def get_errors(self, series: pd.Series, column: 'column.Column'): - - errors = [] - - # Calculate which columns are valid using the child class's validate function, skipping empty entries if the - # column specifies to do so - simple_validation = ~self.validate(series) - if column.allow_empty: - # Failing results are those that are not empty, and fail the validation - # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is - if is_categorical_dtype(series) or is_numeric_dtype(series): - validated = ~series.isnull() & simple_validation - else: - validated = (series.str.len() > 0) & simple_validation - - else: - validated = simple_validation - - # Cut down the original series to only ones that failed the validation - indices = series.index[validated] - - # Use these indices to find the failing items. 
Also print the index which is probably a row number - for i in indices: - element = series[i] - errors.append(ValidationWarning( - message=self.message, - value=element, - row=i, - column=series.name - )) - - return errors - - -class _InverseValidation(_SeriesValidation): - """ - Negates an ElementValidation - """ - - def __init__(self, validation: _SeriesValidation): - self.negated = validation - super().__init__() - - def validate(self, series: pd.Series): - return ~ self.negated.validate(series) - - @property - def default_message(self): - return self.negated.message + ' ' - - -class _CombinedValidation(_SeriesValidation): - """ - Validates if one and/or the other validation is true for an element - """ - - def __init__(self, validation_a: _SeriesValidation, validation_b: _SeriesValidation, operator): - self.operator = operator - self.v_a = validation_a - self.v_b = validation_b - super().__init__() - - def validate(self, series: pd.Series): - return self.operator(self.v_a.validate(series), self.v_b.validate(series)) - - @property - def default_message(self): - return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) - - -class CustomSeriesValidation(_SeriesValidation): - """ - Validates using a user-provided function that operates on an entire series (for example by using one of the pandas - Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) - """ - - def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): - """ - :param message: The error message to provide to the user if this validation fails. 
The row and column and - failing value will automatically be prepended to this message, so you only have to provide a message that - describes what went wrong, for example 'failed my validation' will become - - {row: 1, column: "Column Name"}: "Value" failed my validation - :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal - to True if the object passed validation, and False if it failed - """ - self._validation = validation - super().__init__(message=message) - - def validate(self, series: pd.Series) -> pd.Series: - return self._validation(series) - - -class CustomElementValidation(_SeriesValidation): - """ - Validates using a user-provided function that operates on each element - """ - - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): - """ - :param message: The error message to provide to the user if this validation fails. The row and column and - failing value will automatically be prepended to this message, so you only have to provide a message that - describes what went wrong, for example 'failed my validation' will become - - {row: 1, column: "Column Name"}: "Value" failed my validation - :param validation: A function that takes the value of a data frame cell and returns True if it passes the - the validation, and false if it doesn't - """ - self._validation = validation - super().__init__(message=message) - - def validate(self, series: pd.Series) -> pd.Series: - return series.apply(self._validation) - - -class InRangeValidation(_SeriesValidation): - """ - Checks that each element in the series is within a given numerical range - """ - - def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): - """ - :param min: The minimum (inclusive) value to accept - :param max: The maximum (exclusive) value to accept - """ - self.min = min - self.max = max - super().__init__(**kwargs) - - @property - def default_message(self): - return 'was not in 
the range [{}, {})'.format(self.min, self.max) - - def validate(self, series: pd.Series) -> pd.Series: - series = pd.to_numeric(series) - return (series >= self.min) & (series < self.max) - -class IsDtypeValidation(_BaseValidation): - """ - Checks that a series has a certain numpy dtype - """ - - def __init__(self, dtype: np.dtype, **kwargs): - """ - :param dtype: The numpy dtype to check the column against - """ - self.dtype = dtype - super().__init__(**kwargs) - - def get_errors(self, series: pd.Series, column: 'column.Column' = None): - if not np.issubdtype(series.dtype, self.dtype): - return [ValidationWarning( - 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( - column.name if column else '', series.dtype, self.dtype - ) - )] - else: - return [] - - -class CanCallValidation(_SeriesValidation): - """ - Validates if a given function can be called on each element in a column without raising an exception - """ - - def __init__(self, func: typing.Callable, **kwargs): - """ - :param func: A python function that will be called with the value of each cell in the DataFrame. If this - function throws an error, this cell is considered to have failed the validation. Otherwise it has passed. 
- """ - if callable(type): - self.callable = func - else: - raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) - super().__init__(**kwargs) - - @property - def default_message(self): - return 'raised an exception when the callable {} was called on it'.format(self.callable) - - def can_call(self, var): - try: - self.callable(var) - return True - except: - return False - - def validate(self, series: pd.Series) -> pd.Series: - return series.apply(self.can_call) - - -class CanConvertValidation(CanCallValidation): - """ - Checks if each element in a column can be converted to a Python object type - """ - - """ - Internally this uses the same logic as CanCallValidation since all types are callable in python. - However this class overrides the error messages to make them more directed towards types - """ - - def __init__(self, _type: type, **kwargs): + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: """ - :param _type: Any python type. Its constructor will be called with the value of the individual cell as its - only argument. 
If it throws an exception, the value is considered to fail the validation, otherwise it has passed + Validate a single series """ - if isinstance(_type, type): - super(CanConvertValidation, self).__init__(_type, **kwargs) - else: - raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): - return 'cannot be converted to type {}'.format(self.callable) + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + series = self.select_series(df) + return self.validate_series(series) -class MatchesPatternValidation(_SeriesValidation): +class IndexSeriesValidation(_SeriesValidation): """ - Validates that a string or regular expression can match somewhere in each element in this column + Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation + or later """ - def __init__(self, pattern, options={}, **kwargs): + def __init__(self, index: typing.Union[int, str] = None, position: bool = False): """ - :param kwargs: Arguments to pass to Series.str.contains - (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html) - pat is the only required argument + Creates a new IndexSeriesValidation + :param index: An index with which to select the series + :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
+ Otherwise it's a label (ie, index=0 indicates the column with the label of 0) """ - self.pattern = pattern - self.options = options - super().__init__(**kwargs) - - @property - def default_message(self): - return 'does not match the pattern "{}"'.format(self.pattern) - - def validate(self, series: pd.Series) -> pd.Series: - return series.astype(str).str.contains(self.pattern, **self.options) - - -class TrailingWhitespaceValidation(_SeriesValidation): - """ - Checks that there is no trailing whitespace in this column - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains trailing whitespace' - - def validate(self, series: pd.Series) -> pd.Series: - return ~series.astype(str).str.contains('\s+$') - - -class LeadingWhitespaceValidation(_SeriesValidation): - """ - Checks that there is no leading whitespace in this column - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains leading whitespace' - - def validate(self, series: pd.Series) -> pd.Series: - return ~series.astype(str).str.contains('^\s+') - - -class IsDistinctValidation(_SeriesValidation): - """ - Checks that every element of this column is different from each other element - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains values that are not unique' + self.index = index + self.position = position - def validate(self, series: pd.Series) -> pd.Series: - return ~series.duplicated(keep='first') - - -class InListValidation(_SeriesValidation): - """ - Checks that each element in this column is contained within a list of possibilities - """ - - def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwargs): + def select_series(self, df: pd.DataFrame) -> pd.Series: """ - :param options: A list of values to check. 
- If the value of a cell is in this list, it is considered to pass the - validation + Select a column of the DataFrame as a series, using the index data stored in this validation """ - self.case_sensitive = case_sensitive - self.options = options - super().__init__(**kwargs) - - @property - def default_message(self): - values = ', '.join(str(v) for v in self.options) - return 'is not in the list of legal options ({})'.format(values) + if self.index is None: + raise PanSchNoIndexError() - def validate(self, series: pd.Series) -> pd.Series: - if self.case_sensitive: - return series.isin(self.options) + if self.position: + return df.iloc[:, self.index] else: - return series.str.lower().isin([s.lower() for s in self.options]) - - -class DateFormatValidation(_SeriesValidation): - """ - Checks that each element in this column is a valid date according to a provided format string - """ - - def __init__(self, date_format: str, **kwargs): - """ - :param date_format: The date format string to validate the column against. Refer to the date format code - documentation at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for a full - list of format codes - """ - self.date_format = date_format - super().__init__(**kwargs) - - @property - def default_message(self): - return 'does not match the date format string "{}"'.format(self.date_format) - - def valid_date(self, val): - try: - datetime.datetime.strptime(val, self.date_format) - return True - except: - return False - - def validate(self, series: pd.Series) -> pd.Series: - return series.astype(str).apply(self.valid_date) + return df.loc[:, self.index]