From 2f7e7f3d3295bb0c9e3069b6afe2dc2a37b167a1 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Mon, 18 Nov 2019 01:24:46 +1100
Subject: [PATCH] Initial groundwork for the rewrite

---
 pandas_schema/column.py     |  63 ++++--
 pandas_schema/errors.py     |   6 +
 pandas_schema/schema.py     |   2 +-
 pandas_schema/validation.py | 411 +++---------------------------------
 4 files changed, 85 insertions(+), 397 deletions(-)

diff --git a/pandas_schema/column.py b/pandas_schema/column.py
index 199b883..cec4153 100644
--- a/pandas_schema/column.py
+++ b/pandas_schema/column.py
@@ -4,24 +4,51 @@
 from . import validation
 from .validation_warning import ValidationWarning
 
-class Column:
-    def __init__(self, name: str, validations: typing.Iterable['validation._BaseValidation'] = [], allow_empty=False):
-        """
-        Creates a new Column object
 
-        :param name: The column header that defines this column. This must be identical to the header used in the CSV/Data Frame you are validating.
-        :param validations: An iterable of objects implementing _BaseValidation that will generate ValidationErrors
-        :param allow_empty: True if an empty column is considered valid. False if we leave that logic up to the Validation
-        """
-        self.name = name
-        self.validations = list(validations)
-        self.allow_empty = allow_empty
+def _column(
+        validations: typing.Iterable[validation.IndexSeriesValidation],
+        index: typing.Union[int, str] = None,
+        position: bool = False
+):
+    """
+    A utility function that points each validation at a series by overwriting its ``index`` and ``position`` attributes
+    :param validations: The validations to modify; each one is mutated in place and nothing is returned
+    :param index: The index of the series that these validations will now consider
+    :param position: If true, these validations use positional indexing, otherwise ``index`` is treated as a label.
+    See :py:class:`pandas_schema.validation.IndexSeriesValidation`
+    """
+    for valid in validations:
+        valid.index = index
+        valid.position = position
 
-    def validate(self, series: pd.Series) -> typing.List[ValidationWarning]:
-        """
-        Creates a list of validation errors using the Validation objects contained in the Column
 
-        :param series: A pandas Series to validate
-        :return: An iterable of ValidationError instances generated by the validation
-        """
-        return [error for validation in self.validations for error in validation.get_errors(series, self)]
+def label_column(
+        validations: typing.Iterable[validation.IndexSeriesValidation],
+        index: typing.Union[int, str],
+):
+    """
+    A utility function that makes each validation select its series by label (it passes ``position=False``)
+    :param validations: The validations to modify; each one is mutated in place
+    :param index: The label of the series that these validations will now consider
+    """
+    return _column(
+        validations,
+        index,
+        position=False
+    )
+
+
+def positional_column(
+        validations: typing.Iterable[validation.IndexSeriesValidation],
+        index: int,
+):
+    """
+    A utility function that makes each validation select its series by position (it passes ``position=True``)
+    :param validations: The validations to modify; each one is mutated in place
+    :param index: The position of the series that these validations will now consider (0 is the first series)
+    """
+    return _column(
+        validations,
+        index,
+        position=True
+    )
diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py
index a9176bf..ab5e73d 100644
--- a/pandas_schema/errors.py
+++ b/pandas_schema/errors.py
@@ -10,6 +10,12 @@ class PanSchInvalidSchemaError(PanSchError):
     """
 
 
+class PanSchNoIndexError(PanSchInvalidSchemaError):
+    """
+    A validation was asked to select a series, but no index has been set on it (its index is still None)
+    """
+
+
 class PanSchArgumentError(PanSchError):
     """
     An argument passed to a function has an invalid type or value
diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py
index 5c0442e..13d8158 100644
--- a/pandas_schema/schema.py
+++ b/pandas_schema/schema.py
@@ -11,7 +11,7 @@ class Schema:
     A schema that defines the columns required in the target DataFrame
     """
 
-    def __init__(self, columns: typing.Iterable[Column], ordered: bool = False):
+    def __init__(self, columns: typing.Iterable[Column],  ordered: bool = False):
         """
         :param columns: A list of column objects
         :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring
diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py
index 2a3f2f8..9343d7b 100644
--- a/pandas_schema/validation.py
+++ b/pandas_schema/validation.py
@@ -8,412 +8,67 @@
 
 from . import column
 from .validation_warning import ValidationWarning
-from .errors import PanSchArgumentError
+from .errors import PanSchArgumentError, PanSchNoIndexError
 from pandas.api.types import is_categorical_dtype, is_numeric_dtype
 
 
-class _BaseValidation:
-    """
-    The validation base class that defines any object that can create a list of errors from a Series
-    """
-    __metaclass__ = abc.ABCMeta
-
+class _BaseValidation(abc.ABC):
     @abc.abstractmethod
-    def get_errors(self, series: pd.Series, column: 'column.Column') -> typing.Iterable[ValidationWarning]:
+    def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]:
         """
-        Return a list of errors in the given series
-        :param series:
-        :param column:
-        :return:
+        Validates a data frame. This method is abstract, so every concrete validation must implement it
+        :param df: Data frame to validate
+        :return: All validation failures detected by this validation
         """
 
 
 class _SeriesValidation(_BaseValidation):
     """
-    Implements the _BaseValidation interface by returning a Boolean series for each element that either passes or
-    fails the validation
+    A _SeriesValidation validates a DataFrame by selecting a single series from it with select_series(), and
+    then passing that series to validate_series()
     """
-    __metaclass__ = abc.ABCMeta
-
-    def __init__(self, **kwargs):
-        self._custom_message = kwargs.get('message')
-
-    @property
-    def message(self):
-        return self._custom_message or self.default_message
-
-    @abc.abstractproperty
-    def default_message(self) -> str:
-        """
-        Create a message to be displayed whenever this validation fails
-        This should be a generic message for the validation type, but can be overwritten if the user provides a
-        message kwarg
-        """
 
     @abc.abstractmethod
-    def validate(self, series: pd.Series) -> pd.Series:
-        """
-        Returns a Boolean series, where each value of False is an element in the Series that has failed the validation
-        :param series:
-        :return:
-        """
-
-    def __invert__(self):
-        """
-        Returns a negated version of this validation
-        """
-        return _InverseValidation(self)
-
-    def __or__(self, other: '_SeriesValidation'):
-        """
-        Returns a validation which is true if either this or the other validation is true
-        """
-        return _CombinedValidation(self, other, operator.or_)
-
-    def __and__(self, other: '_SeriesValidation'):
+    def select_series(self, df: pd.DataFrame) -> pd.Series:
         """
-        Returns a validation which is true if either this or the other validation is true
+        Selects the series from the DataFrame that validate() will pass on to validate_series()
         """
-        return _CombinedValidation(self, other, operator.and_)
-
-    def get_errors(self, series: pd.Series, column: 'column.Column'):
-
-        errors = []
-
-        # Calculate which columns are valid using the child class's validate function, skipping empty entries if the
-        # column specifies to do so
-        simple_validation = ~self.validate(series)
-        if column.allow_empty:
-            # Failing results are those that are not empty, and fail the validation
-            # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is
-            if is_categorical_dtype(series) or is_numeric_dtype(series):
-                validated = ~series.isnull() & simple_validation
-            else:
-                validated = (series.str.len() > 0) & simple_validation
-
-        else:
-            validated = simple_validation
-
-        # Cut down the original series to only ones that failed the validation
-        indices = series.index[validated]
-
-        # Use these indices to find the failing items. Also print the index which is probably a row number
-        for i in indices:
-            element = series[i]
-            errors.append(ValidationWarning(
-                message=self.message,
-                value=element,
-                row=i,
-                column=series.name
-            ))
-
-        return errors
-
-
-class _InverseValidation(_SeriesValidation):
-    """
-    Negates an ElementValidation
-    """
-
-    def __init__(self, validation: _SeriesValidation):
-        self.negated = validation
-        super().__init__()
-
-    def validate(self, series: pd.Series):
-        return ~ self.negated.validate(series)
-
-    @property
-    def default_message(self):
-        return self.negated.message + ' <negated>'
-
-
-class _CombinedValidation(_SeriesValidation):
-    """
-    Validates if one and/or the other validation is true for an element
-    """
-
-    def __init__(self, validation_a: _SeriesValidation, validation_b: _SeriesValidation, operator):
-        self.operator = operator
-        self.v_a = validation_a
-        self.v_b = validation_b
-        super().__init__()
-
-    def validate(self, series: pd.Series):
-        return self.operator(self.v_a.validate(series), self.v_b.validate(series))
-
-    @property
-    def default_message(self):
-        return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message)
-
-
-class CustomSeriesValidation(_SeriesValidation):
-    """
-    Validates using a user-provided function that operates on an entire series (for example by using one of the pandas
-    Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series)
-    """
-
-    def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str):
-        """
-        :param message: The error message to provide to the user if this validation fails. The row and column and
-            failing value will automatically be prepended to this message, so you only have to provide a message that
-            describes what went wrong, for example 'failed my validation' will become
-
-            {row: 1, column: "Column Name"}: "Value" failed my validation
-        :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal
-            to True if the object passed validation, and False if it failed
-        """
-        self._validation = validation
-        super().__init__(message=message)
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return self._validation(series)
-
-
-class CustomElementValidation(_SeriesValidation):
-    """
-    Validates using a user-provided function that operates on each element
-    """
-
-    def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str):
-        """
-        :param message: The error message to provide to the user if this validation fails. The row and column and
-            failing value will automatically be prepended to this message, so you only have to provide a message that
-            describes what went wrong, for example 'failed my validation' will become
-
-            {row: 1, column: "Column Name"}: "Value" failed my validation
-        :param validation: A function that takes the value of a data frame cell and returns True if it passes the
-            the validation, and false if it doesn't
-        """
-        self._validation = validation
-        super().__init__(message=message)
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return series.apply(self._validation)
-
-
-class InRangeValidation(_SeriesValidation):
-    """
-    Checks that each element in the series is within a given numerical range
-    """
-
-    def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs):
-        """
-        :param min: The minimum (inclusive) value to accept
-        :param max: The maximum (exclusive) value to accept
-        """
-        self.min = min
-        self.max = max
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'was not in the range [{}, {})'.format(self.min, self.max)
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        series = pd.to_numeric(series)
-        return (series >= self.min) & (series < self.max)
-
 
-class IsDtypeValidation(_BaseValidation):
-    """
-    Checks that a series has a certain numpy dtype
-    """
-
-    def __init__(self, dtype: np.dtype, **kwargs):
-        """
-        :param dtype: The numpy dtype to check the column against
-        """
-        self.dtype = dtype
-        super().__init__(**kwargs)
-
-    def get_errors(self, series: pd.Series, column: 'column.Column' = None):
-        if not np.issubdtype(series.dtype, self.dtype):
-            return [ValidationWarning(
-                'The column {} has a dtype of {} which is not a subclass of the required type {}'.format(
-                    column.name if column else '', series.dtype, self.dtype
-                )
-            )]
-        else:
-            return []
-
-
-class CanCallValidation(_SeriesValidation):
-    """
-    Validates if a given function can be called on each element in a column without raising an exception
-    """
-
-    def __init__(self, func: typing.Callable, **kwargs):
-        """
-        :param func: A python function that will be called with the value of each cell in the DataFrame. If this
-            function throws an error, this cell is considered to have failed the validation. Otherwise it has passed.
-        """
-        if callable(type):
-            self.callable = func
-        else:
-            raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type))
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'raised an exception when the callable {} was called on it'.format(self.callable)
-
-    def can_call(self, var):
-        try:
-            self.callable(var)
-            return True
-        except:
-            return False
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return series.apply(self.can_call)
-
-
-class CanConvertValidation(CanCallValidation):
-    """
-    Checks if each element in a column can be converted to a Python object type
-    """
-
-    """
-    Internally this uses the same logic as CanCallValidation since all types are callable in python.
-    However this class overrides the error messages to make them more directed towards types
-    """
-
-    def __init__(self, _type: type, **kwargs):
+    @abc.abstractmethod
+    def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]:
         """
-        :param _type: Any python type. Its constructor will be called with the value of the individual cell as its
-            only argument. If it throws an exception, the value is considered to fail the validation, otherwise it has passed
+        Validate the single series chosen by select_series(), returning the validation failures found in it
         """
-        if isinstance(_type, type):
-            super(CanConvertValidation, self).__init__(_type, **kwargs)
-        else:
-            raise PanSchArgumentError('{} is not a valid type'.format(_type))
 
-    @property
-    def default_message(self):
-        return 'cannot be converted to type {}'.format(self.callable)
+    def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]:
+        series = self.select_series(df)  # the subclass decides which series of df is checked
+        return self.validate_series(series)  # the subclass produces the warnings for that series
 
 
-class MatchesPatternValidation(_SeriesValidation):
+class IndexSeriesValidation(_SeriesValidation):
     """
-    Validates that a string or regular expression can match somewhere in each element in this column
+    Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation
+    or later
     """
 
-    def __init__(self, pattern, options={}, **kwargs):
+    def __init__(self, index: typing.Union[int, str] = None, position: bool = False):
         """
-        :param kwargs: Arguments to pass to Series.str.contains
-            (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html)
-            pat is the only required argument
+        Creates a new IndexSeriesValidation
+        :param index: The index with which to select the series. May be left as None here and assigned later
+        :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element).
+        Otherwise it's a label (ie, index=0 indicates the column with the label of 0)
         """
-        self.pattern = pattern
-        self.options = options
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'does not match the pattern "{}"'.format(self.pattern)
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return series.astype(str).str.contains(self.pattern, **self.options)
-
-
-class TrailingWhitespaceValidation(_SeriesValidation):
-    """
-    Checks that there is no trailing whitespace in this column
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'contains trailing whitespace'
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return ~series.astype(str).str.contains('\s+$')
-
-
-class LeadingWhitespaceValidation(_SeriesValidation):
-    """
-    Checks that there is no leading whitespace in this column
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'contains leading whitespace'
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return ~series.astype(str).str.contains('^\s+')
-
-
-class IsDistinctValidation(_SeriesValidation):
-    """
-    Checks that every element of this column is different from each other element
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'contains values that are not unique'
+        self.index = index  # store the argument itself; `column` here would be the imported module, not an index
+        self.position = position
 
-    def validate(self, series: pd.Series) -> pd.Series:
-        return ~series.duplicated(keep='first')
-
-
-class InListValidation(_SeriesValidation):
-    """
-    Checks that each element in this column is contained within a list of possibilities
-    """
-
-    def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwargs):
+    def select_series(self, df: pd.DataFrame) -> pd.Series:
         """
-        :param options: A list of values to check. If the value of a cell is in this list, it is considered to pass the
-            validation
+        Select the column of df named by self.index (by position if self.position is true, otherwise by label). Raises PanSchNoIndexError if self.index is None
         """
-        self.case_sensitive = case_sensitive
-        self.options = options
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        values = ', '.join(str(v) for v in self.options)
-        return 'is not in the list of legal options ({})'.format(values)
+        if self.index is None:
+            raise PanSchNoIndexError('No index has been set on this validation, so no series can be selected')
 
-    def validate(self, series: pd.Series) -> pd.Series:
-        if self.case_sensitive:
-            return series.isin(self.options)
+        if self.position:
+            return df.iloc[:, self.index]  # the leading `:` keeps every row and picks a column, not a row
         else:
-            return series.str.lower().isin([s.lower() for s in self.options])
-
-
-class DateFormatValidation(_SeriesValidation):
-    """
-    Checks that each element in this column is a valid date according to a provided format string
-    """
-
-    def __init__(self, date_format: str, **kwargs):
-        """
-        :param date_format: The date format string to validate the column against. Refer to the date format code
-            documentation at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for a full
-            list of format codes
-        """
-        self.date_format = date_format
-        super().__init__(**kwargs)
-
-    @property
-    def default_message(self):
-        return 'does not match the date format string "{}"'.format(self.date_format)
-
-    def valid_date(self, val):
-        try:
-            datetime.datetime.strptime(val, self.date_format)
-            return True
-        except:
-            return False
-
-    def validate(self, series: pd.Series) -> pd.Series:
-        return series.astype(str).apply(self.valid_date)
+            return df.loc[:, self.index]  # the leading `:` keeps every row and picks the column with this label