diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 9d7a3ba..2334bb6 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -226,8 +226,10 @@ def __init__(self, dtype: np.dtype, **kwargs): def get_errors(self, series: pd.Series, column: 'column.Column' = None): if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( - 'The column has a dtype of {} which is not a subclass of the required type {}'.format(series.dtype, - self.dtype))] + 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( + column.name if column else '', series.dtype, self.dtype + ) + )] else: return [] diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index 087200c..320be65 100644 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -3,7 +3,7 @@ class ValidationWarning: Represents a difference between the schema and data frame, found during the validation of the data frame """ - def __init__(self, message: str, value: str = None, row: int = None, column: str = None): + def __init__(self, message: str, value: str = None, row: int = -1, column: str = None): self.message = message self.value = value """The value of the failing cell in the DataFrame""" diff --git a/test/test_schema.py b/test/test_schema.py index 18cb4ea..a02cfcd 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -1,9 +1,10 @@ from io import StringIO import unittest import pandas as pd +from numpy.core.multiarray import dtype from pandas_schema import Schema, Column -from pandas_schema.validation import LeadingWhitespaceValidation +from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation from pandas_schema.errors import PanSchArgumentError class UnorderedSchema(unittest.TestCase): @@ -34,17 +35,17 @@ def test_validate_invalid(self): def test_mixed_columns(self): """ - Tests that when ordered=False, the schema columns are + Tests that when ordered=False, the schema columns are associated with data frame columns by name, not position. In this case, the schema's column order is [a, b], while the data frame's order is [b, a]. There is an error in - column b in the data frame (leading whitespace), and a + column b in the data frame (leading whitespace), and a validation on column b in the schema. Schema a b (validation) Data Frame b (error) a - Thus there will only be an error if column b in the schema + Thus there will only be an error if column b in the schema is linked to column b in the data frame, as is correct behaviour. """ @@ -72,7 +73,7 @@ def test_column_subset_detect(self): column* is not being passed Thus there will only be an error if column b in the schema - is linked to column b in the data frame, as is correct + is linked to column b in the data frame, as is correct behaviour """ @@ -89,7 +90,6 @@ def test_column_subset_detect(self): self.assertEqual(results[0].row, 0) self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name') - def test_column_subset_detect_empty(self): """ Tests that when ordered=False, validation is possible by @@ -102,7 +102,7 @@ def test_column_subset_detect_empty(self): There will be an error if other than zero errors are found. """ - + df = pd.read_csv(StringIO(''' b,a 1,1 @@ -120,7 +120,7 @@ def test_column_subset_error(self): passing a subset of the columns contained in the schema Schema a b (validation) - Data Frame b (error) a + Data Frame b (error) a There will be an error if a column different than 'a' or 'b' is passed """ @@ -131,7 +131,7 @@ def test_column_subset_error(self): 2,3 3,3 '''), sep=',', header=0, dtype=str) - + # should raise a PanSchArgumentError self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c']) diff --git a/test/test_validation.py b/test/test_validation.py index 41a0161..3e8f2bb 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -2,9 +2,9 @@ import unittest import re -from numpy import nan +from numpy import nan, dtype -from pandas_schema import Column +from pandas_schema import Column, Schema from pandas_schema.validation import _BaseValidation from pandas_schema.validation import * from pandas_schema import ValidationWarning @@ -493,6 +493,36 @@ def test_invalid_items(self): self.assertEqual(type(errors[0]), ValidationWarning) + def test_schema(self): + """ + Test this validation inside a schema, to ensure we get helpful error messages. + In particular, we want to make sure that a ValidationWarning without a row number won't break the schema + """ + df = pd.DataFrame(data={ + 'wrong_dtype1': ['not_an_int'], + 'wrong_dtype2': [123], + 'wrong_dtype3': [12.5] + }) + + schema = Schema([ + Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), + Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), + Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), + ]) + + errors = schema.validate(df) + + self.assertEqual( + sorted([str(x) for x in errors]), + sorted([ + 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', + 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', + 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' + ]) + ) + + + class Negate(ValidationTestBase): """ Tests the ~ operator on a MatchesPatternValidation