From 17cc50da74f672422b2cd3169fd7eea6f7c05f2d Mon Sep 17 00:00:00 2001 From: David Farrington Date: Tue, 11 Sep 2018 09:46:30 +0100 Subject: [PATCH 1/3] Fixes dtype validation, + existing failing test, + adds optional column name to dtype validation error --- example/example.txt | 2 +- pandas_schema/schema.py | 2 +- pandas_schema/validation.py | 6 +++-- test/test_schema.py | 45 +++++++++++++++++++++++++++++-------- 4 files changed, 42 insertions(+), 13 deletions(-) diff --git a/example/example.txt b/example/example.txt index f0397b9..7eca31e 100644 --- a/example/example.txt +++ b/example/example.txt @@ -1,5 +1,5 @@ {row: 0, column: "Given Name"}: "Gerald " contains trailing whitespace {row: 1, column: "Age"}: "270" was not in the range [0, 120) -{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male, Female, Other) +{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male,Female,Other) {row: 2, column: "Family Name"}: "Majewska " contains trailing whitespace {row: 2, column: "Customer ID"}: "775ANSID" does not match the pattern "\d{4}[A-Z]{4}" diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 5c0442e..e8e500f 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -84,7 +84,7 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing for series, column in column_pairs: errors += column.validate(series) - return sorted(errors, key=lambda e: e.row) + return sorted(errors, key=lambda e: e.row or 0) def get_column_names(self): """ diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index f3cfab3..6f6232d 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -226,8 +226,10 @@ def __init__(self, dtype: np.dtype, **kwargs): def get_errors(self, series: pd.Series, column: 'column.Column' = None): if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( - 'The column has a dtype of {} which is not a subclass of the required type {}'.format(series.dtype, - self.dtype))] + 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( + column.name if column else '', series.dtype, self.dtype + ) + )] else: return [] diff --git a/test/test_schema.py b/test/test_schema.py index 18cb4ea..d97c69e 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -1,9 +1,10 @@ from io import StringIO import unittest import pandas as pd +from numpy.core.multiarray import dtype from pandas_schema import Schema, Column -from pandas_schema.validation import LeadingWhitespaceValidation +from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation from pandas_schema.errors import PanSchArgumentError class UnorderedSchema(unittest.TestCase): @@ -34,17 +35,17 @@ def test_validate_invalid(self): def test_mixed_columns(self): """ - Tests that when ordered=False, the schema columns are + Tests that when ordered=False, the schema columns are associated with data frame columns by name, not position. In this case, the schema's column order is [a, b], while the data frame's order is [b, a]. There is an error in - column b in the data frame (leading whitespace), and a + column b in the data frame (leading whitespace), and a validation on column b in the schema. Schema a b (validation) Data Frame b (error) a - Thus there will only be an error if column b in the schema + Thus there will only be an error if column b in the schema is linked to column b in the data frame, as is correct behaviour. """ @@ -72,7 +73,7 @@ def test_column_subset_detect(self): column* is not being passed Thus there will only be an error if column b in the schema - is linked to column b in the data frame, as is correct + is linked to column b in the data frame, as is correct behaviour """ @@ -89,7 +90,33 @@ def test_column_subset_detect(self): self.assertEqual(results[0].row, 0) self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name') - + def test_dtype_validation(self): + """ + Using a schema with dtype validation, we can validate, and get contextual error messages + """ + df = pd.DataFrame(data={ + 'wrong_dtype1': ['not_an_int'], + 'wrong_dtype2': [123], + 'wrong_dtype3': [12.5] + }) + + schema = Schema([ + Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), + Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), + Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), + ]) + + errors = schema.validate(df) + + self.assertEqual( + sorted([str(x) for x in errors]), + sorted([ + 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', + 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', + 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' + ]) + ) + def test_column_subset_detect_empty(self): """ Tests that when ordered=False, validation is possible by @@ -102,7 +129,7 @@ def test_column_subset_detect_empty(self): There will be an error if other than zero errors are found. """ - + df = pd.read_csv(StringIO(''' b,a 1,1 @@ -120,7 +147,7 @@ def test_column_subset_error(self): passing a subset of the columns contained in the schema Schema a b (validation) - Data Frame b (error) a + Data Frame b (error) a There will be an error if a column different than 'a' or 'b' is passed """ @@ -131,7 +158,7 @@ def test_column_subset_error(self): 2,3 3,3 '''), sep=',', header=0, dtype=str) - + # should raise a PanSchArgumentError self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c']) From f2d65e43c5b95166f41d4296e2b337e6f0396ba5 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sun, 28 Apr 2019 21:49:37 +1000 Subject: [PATCH 2/3] Moved test to the right place, made default row index -1, added some comments --- pandas_schema/schema.py | 2 +- pandas_schema/validation_warning.py | 2 +- test/test_schema.py | 27 ----------------------- test/test_validation.py | 34 +++++++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 31 deletions(-) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index e8e500f..5c0442e 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -84,7 +84,7 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing for series, column in column_pairs: errors += column.validate(series) - return sorted(errors, key=lambda e: e.row or 0) + return sorted(errors, key=lambda e: e.row) def get_column_names(self): """ diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index 087200c..320be65 100644 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -3,7 +3,7 @@ class ValidationWarning: Represents a difference between the schema and data frame, found during the validation of the data frame """ - def __init__(self, message: str, value: str = None, row: int = None, column: str = None): + def __init__(self, message: str, value: str = None, row: int = -1, column: str = None): self.message = message self.value = value """The value of the failing cell in the DataFrame""" diff --git a/test/test_schema.py b/test/test_schema.py index d97c69e..a02cfcd 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -90,33 +90,6 @@ def test_column_subset_detect(self): self.assertEqual(results[0].row, 0) self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name') - def test_dtype_validation(self): - """ - Using a schema with dtype validation, we can validate, and get contextual error messages - """ - df = pd.DataFrame(data={ - 'wrong_dtype1': ['not_an_int'], - 'wrong_dtype2': [123], - 'wrong_dtype3': [12.5] - }) - - schema = Schema([ - Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), - Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), - Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), - ]) - - errors = schema.validate(df) - - self.assertEqual( - sorted([str(x) for x in errors]), - sorted([ - 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', - 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', - 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' - ]) - ) - def test_column_subset_detect_empty(self): """ Tests that when ordered=False, validation is possible by diff --git a/test/test_validation.py b/test/test_validation.py index 41a0161..3e8f2bb 100644 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -2,9 +2,9 @@ import unittest import re -from numpy import nan +from numpy import nan, dtype -from pandas_schema import Column +from pandas_schema import Column, Schema from pandas_schema.validation import _BaseValidation from pandas_schema.validation import * from pandas_schema import ValidationWarning @@ -493,6 +493,36 @@ def test_invalid_items(self): self.assertEqual(type(errors[0]), ValidationWarning) + def test_schema(self): + """ + Test this validation inside a schema, to ensure we get helpful error messages. + In particular, we want to make sure that a ValidationWarning without a row number won't break the schema + """ + df = pd.DataFrame(data={ + 'wrong_dtype1': ['not_an_int'], + 'wrong_dtype2': [123], + 'wrong_dtype3': [12.5] + }) + + schema = Schema([ + Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), + Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), + Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), + ]) + + errors = schema.validate(df) + + self.assertEqual( + sorted([str(x) for x in errors]), + sorted([ + 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', + 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', + 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' + ]) + ) + + + class Negate(ValidationTestBase): """ Tests the ~ operator on a MatchesPatternValidation From 18bc8ef6654327f076429f7cdade2f747d7c27ff Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sun, 28 Apr 2019 21:54:08 +1000 Subject: [PATCH 3/3] Undo example.txt edit (this is already fixed by #20) --- example/example.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/example.txt b/example/example.txt index 7eca31e..f0397b9 100644 --- a/example/example.txt +++ b/example/example.txt @@ -1,5 +1,5 @@ {row: 0, column: "Given Name"}: "Gerald " contains trailing whitespace {row: 1, column: "Age"}: "270" was not in the range [0, 120) -{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male,Female,Other) +{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male, Female, Other) {row: 2, column: "Family Name"}: "Majewska " contains trailing whitespace {row: 2, column: "Customer ID"}: "775ANSID" does not match the pattern "\d{4}[A-Z]{4}"