Merge pull request #18 from farridav/master

Fix Schema Validation when using DtypeValidator
multimeric · Apr 28, 2019 · 2886b77
2 parents 4f64baa + 18bc8ef
commit 2886b77
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 14 deletions.
diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py
@@ -226,8 +226,10 @@ def __init__(self, dtype: np.dtype, **kwargs):
     def get_errors(self, series: pd.Series, column: 'column.Column' = None):
         if not np.issubdtype(series.dtype, self.dtype):
             return [ValidationWarning(
-                'The column has a dtype of {} which is not a subclass of the required type {}'.format(series.dtype,
-                                                                                                      self.dtype))]
+                'The column {} has a dtype of {} which is not a subclass of the required type {}'.format(
+                    column.name if column else '', series.dtype, self.dtype
+                )
+            )]
         else:
             return []
 

diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py
@@ -3,7 +3,7 @@ class ValidationWarning:
     Represents a difference between the schema and data frame, found during the validation of the data frame
     """
 
-    def __init__(self, message: str, value: str = None, row: int = None, column: str = None):
+    def __init__(self, message: str, value: str = None, row: int = -1, column: str = None):
         self.message = message
         self.value = value
         """The value of the failing cell in the DataFrame"""

diff --git a/test/test_schema.py b/test/test_schema.py
@@ -1,9 +1,10 @@
 from io import StringIO
 import unittest
 import pandas as pd
+from numpy.core.multiarray import dtype
 
 from pandas_schema import Schema, Column
-from pandas_schema.validation import LeadingWhitespaceValidation
+from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation
 from pandas_schema.errors import PanSchArgumentError
 
 class UnorderedSchema(unittest.TestCase):
@@ -34,17 +35,17 @@ def test_validate_invalid(self):
 
     def test_mixed_columns(self):
         """
-        Tests that when ordered=False, the schema columns are 
+        Tests that when ordered=False, the schema columns are
         associated with data frame columns by name, not position.
         In this case, the schema's column order is [a, b], while
          the data frame's order is [b, a]. There is an error in
-        column b in the data frame (leading whitespace), and a 
+        column b in the data frame (leading whitespace), and a
         validation on column b in the schema.
 
         Schema         a                b (validation)
         Data Frame     b (error)        a
 
-        Thus there will only be an error if column b in the schema 
+        Thus there will only be an error if column b in the schema
         is linked to column b in the data frame, as is correct
         behaviour.
         """
@@ -72,7 +73,7 @@ def test_column_subset_detect(self):
         column* is not being passed
 
         Thus there will only be an error if column b in the schema
-        is linked to column b in the data frame, as is correct 
+        is linked to column b in the data frame, as is correct
         behaviour
         """
 
@@ -89,7 +90,6 @@ def test_column_subset_detect(self):
         self.assertEqual(results[0].row, 0)
         self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name')
 
-
     def test_column_subset_detect_empty(self):
         """
         Tests that when ordered=False, validation is possible by
@@ -102,7 +102,7 @@ def test_column_subset_detect_empty(self):
 
         There will be an error if other than zero errors are found.
         """
-        
+
         df = pd.read_csv(StringIO('''
 b,a
  1,1
@@ -120,7 +120,7 @@ def test_column_subset_error(self):
         passing a subset of the columns contained in the schema
 
         Schema         a                b (validation)
-        Data Frame     b (error)        a 
+        Data Frame     b (error)        a
 
         There will be an error if a column different than 'a' or 'b' is passed
         """
@@ -131,7 +131,7 @@ def test_column_subset_error(self):
 2,3
 3,3
         '''), sep=',', header=0, dtype=str)
-        
+
         # should raise a PanSchArgumentError
         self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c'])
 

diff --git a/test/test_validation.py b/test/test_validation.py
@@ -2,9 +2,9 @@
 import unittest
 import re
 
-from numpy import nan
+from numpy import nan, dtype
 
-from pandas_schema import Column
+from pandas_schema import Column, Schema
 from pandas_schema.validation import _BaseValidation
 from pandas_schema.validation import *
 from pandas_schema import ValidationWarning
@@ -493,6 +493,36 @@ def test_invalid_items(self):
         self.assertEqual(type(errors[0]), ValidationWarning)
 
 
+    def test_schema(self):
+        """
+        Test this validation inside a schema, to ensure we get helpful error messages.
+        In particular, we want to make sure that a ValidationWarning without a row number won't break the schema
+        """
+        df = pd.DataFrame(data={
+            'wrong_dtype1': ['not_an_int'],
+            'wrong_dtype2': [123],
+            'wrong_dtype3': [12.5]
+        })
+
+        schema = Schema([
+            Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
+            Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
+            Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
+        ])
+
+        errors = schema.validate(df)
+
+        self.assertEqual(
+            sorted([str(x) for x in errors]),
+            sorted([
+                'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
+                'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
+                'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
+            ])
+        )
+
+
+
 class Negate(ValidationTestBase):
     """
     Tests the ~ operator on a MatchesPatternValidation