Skip to content

Commit

Permalink
Merge pull request #18 from farridav/master
Browse files (browse the repository at this point in the history)
Fix Schema Validation when using DtypeValidator
  • Loading branch information
multimeric authored Apr 28, 2019
2 parents 4f64baa + 18bc8ef commit 2886b77
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 14 deletions.
6 changes: 4 additions & 2 deletions pandas_schema/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,10 @@ def __init__(self, dtype: np.dtype, **kwargs):
def get_errors(self, series: pd.Series, column: 'column.Column' = None):
if not np.issubdtype(series.dtype, self.dtype):
return [ValidationWarning(
'The column has a dtype of {} which is not a subclass of the required type {}'.format(series.dtype,
self.dtype))]
'The column {} has a dtype of {} which is not a subclass of the required type {}'.format(
column.name if column else '', series.dtype, self.dtype
)
)]
else:
return []

Expand Down
2 changes: 1 addition & 1 deletion pandas_schema/validation_warning.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ class ValidationWarning:
Represents a difference between the schema and data frame, found during the validation of the data frame
"""

def __init__(self, message: str, value: str = None, row: int = None, column: str = None):
def __init__(self, message: str, value: str = None, row: int = -1, column: str = None):
self.message = message
self.value = value
"""The value of the failing cell in the DataFrame"""
Expand Down
18 changes: 9 additions & 9 deletions test/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from io import StringIO
import unittest
import pandas as pd
from numpy.core.multiarray import dtype

from pandas_schema import Schema, Column
from pandas_schema.validation import LeadingWhitespaceValidation
from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation
from pandas_schema.errors import PanSchArgumentError

class UnorderedSchema(unittest.TestCase):
Expand Down Expand Up @@ -34,17 +35,17 @@ def test_validate_invalid(self):

def test_mixed_columns(self):
"""
Tests that when ordered=False, the schema columns are
Tests that when ordered=False, the schema columns are
associated with data frame columns by name, not position.
In this case, the schema's column order is [a, b], while
the data frame's order is [b, a]. There is an error in
column b in the data frame (leading whitespace), and a
column b in the data frame (leading whitespace), and a
validation on column b in the schema.
Schema a b (validation)
Data Frame b (error) a
Thus there will only be an error if column b in the schema
Thus there will only be an error if column b in the schema
is linked to column b in the data frame, as is correct
behaviour.
"""
Expand Down Expand Up @@ -72,7 +73,7 @@ def test_column_subset_detect(self):
column* is not being passed
Thus there will only be an error if column b in the schema
is linked to column b in the data frame, as is correct
is linked to column b in the data frame, as is correct
behaviour
"""

Expand All @@ -89,7 +90,6 @@ def test_column_subset_detect(self):
self.assertEqual(results[0].row, 0)
self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name')


def test_column_subset_detect_empty(self):
"""
Tests that when ordered=False, validation is possible by
Expand All @@ -102,7 +102,7 @@ def test_column_subset_detect_empty(self):
There will be an error if other than zero errors are found.
"""

df = pd.read_csv(StringIO('''
b,a
1,1
Expand All @@ -120,7 +120,7 @@ def test_column_subset_error(self):
passing a subset of the columns contained in the schema
Schema a b (validation)
Data Frame b (error) a
Data Frame b (error) a
There will be an error if a column different than 'a' or 'b' is passed
"""
Expand All @@ -131,7 +131,7 @@ def test_column_subset_error(self):
2,3
3,3
'''), sep=',', header=0, dtype=str)

# should raise a PanSchArgumentError
self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c'])

Expand Down
34 changes: 32 additions & 2 deletions test/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import unittest
import re

from numpy import nan
from numpy import nan, dtype

from pandas_schema import Column
from pandas_schema import Column, Schema
from pandas_schema.validation import _BaseValidation
from pandas_schema.validation import *
from pandas_schema import ValidationWarning
Expand Down Expand Up @@ -493,6 +493,36 @@ def test_invalid_items(self):
self.assertEqual(type(errors[0]), ValidationWarning)


def test_schema(self):
    """
    Run IsDtypeValidation through a full Schema and check the reported messages.

    Every column in the frame holds data whose dtype differs from the dtype its
    schema column requires, so one warning per column is expected. These warnings
    carry no row number, and validating through the schema must still succeed.
    """
    frame = pd.DataFrame(data={
        'wrong_dtype1': ['not_an_int'],
        'wrong_dtype2': [123],
        'wrong_dtype3': [12.5]
    })

    # (column name, required dtype) pairs; none matches the data built above
    requirements = [
        ('wrong_dtype1', dtype('int64')),
        ('wrong_dtype2', dtype('float64')),
        ('wrong_dtype3', dtype('int64')),
    ]
    schema = Schema([
        Column(name, [IsDtypeValidation(required)])
        for name, required in requirements
    ])

    # Compare as sorted lists so the order in which columns are validated is irrelevant
    reported = sorted(str(warning) for warning in schema.validate(frame))
    expected = sorted([
        'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
        'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
        'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
    ])

    self.assertEqual(reported, expected)



class Negate(ValidationTestBase):
"""
Tests the ~ operator on a MatchesPatternValidation
Expand Down

0 comments on commit 2886b77

Please sign in to comment.