From 17cc50da74f672422b2cd3169fd7eea6f7c05f2d Mon Sep 17 00:00:00 2001
From: David Farrington <info@davidfarrington.co.uk>
Date: Tue, 11 Sep 2018 09:46:30 +0100
Subject: [PATCH 1/3] Fix dtype validation and an existing failing test; add
 optional column name to dtype validation error

---
 example/example.txt         |  2 +-
 pandas_schema/schema.py     |  2 +-
 pandas_schema/validation.py |  6 +++--
 test/test_schema.py         | 45 +++++++++++++++++++++++++++++--------
 4 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/example/example.txt b/example/example.txt
index f0397b9..7eca31e 100644
--- a/example/example.txt
+++ b/example/example.txt
@@ -1,5 +1,5 @@
 {row: 0, column: "Given Name"}: "Gerald " contains trailing whitespace
 {row: 1, column: "Age"}: "270" was not in the range [0, 120)
-{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male, Female, Other)
+{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male,Female,Other)
 {row: 2, column: "Family Name"}: "Majewska " contains trailing whitespace
 {row: 2, column: "Customer ID"}: "775ANSID" does not match the pattern "\d{4}[A-Z]{4}"
diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py
index 5c0442e..e8e500f 100644
--- a/pandas_schema/schema.py
+++ b/pandas_schema/schema.py
@@ -84,7 +84,7 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing
         for series, column in column_pairs:
             errors += column.validate(series)
 
-        return sorted(errors, key=lambda e: e.row)
+        return sorted(errors, key=lambda e: e.row or 0)
 
     def get_column_names(self):
         """
diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py
index f3cfab3..6f6232d 100644
--- a/pandas_schema/validation.py
+++ b/pandas_schema/validation.py
@@ -226,8 +226,10 @@ def __init__(self, dtype: np.dtype, **kwargs):
     def get_errors(self, series: pd.Series, column: 'column.Column' = None):
         if not np.issubdtype(series.dtype, self.dtype):
             return [ValidationWarning(
-                'The column has a dtype of {} which is not a subclass of the required type {}'.format(series.dtype,
-                                                                                                      self.dtype))]
+                'The column {} has a dtype of {} which is not a subclass of the required type {}'.format(
+                    column.name if column else '', series.dtype, self.dtype
+                )
+            )]
         else:
             return []
 
diff --git a/test/test_schema.py b/test/test_schema.py
index 18cb4ea..d97c69e 100644
--- a/test/test_schema.py
+++ b/test/test_schema.py
@@ -1,9 +1,10 @@
 from io import StringIO
 import unittest
 import pandas as pd
+from numpy.core.multiarray import dtype
 
 from pandas_schema import Schema, Column
-from pandas_schema.validation import LeadingWhitespaceValidation
+from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation
 from pandas_schema.errors import PanSchArgumentError
 
 class UnorderedSchema(unittest.TestCase):
@@ -34,17 +35,17 @@ def test_validate_invalid(self):
 
     def test_mixed_columns(self):
         """
-        Tests that when ordered=False, the schema columns are 
+        Tests that when ordered=False, the schema columns are
         associated with data frame columns by name, not position.
         In this case, the schema's column order is [a, b], while
          the data frame's order is [b, a]. There is an error in
-        column b in the data frame (leading whitespace), and a 
+        column b in the data frame (leading whitespace), and a
         validation on column b in the schema.
 
         Schema         a                b (validation)
         Data Frame     b (error)        a
 
-        Thus there will only be an error if column b in the schema 
+        Thus there will only be an error if column b in the schema
         is linked to column b in the data frame, as is correct
         behaviour.
         """
@@ -72,7 +73,7 @@ def test_column_subset_detect(self):
         column* is not being passed
 
         Thus there will only be an error if column b in the schema
-        is linked to column b in the data frame, as is correct 
+        is linked to column b in the data frame, as is correct
         behaviour
         """
 
@@ -89,7 +90,33 @@ def test_column_subset_detect(self):
         self.assertEqual(results[0].row, 0)
         self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name')
 
-        
+    def test_dtype_validation(self):
+        """
+        Using a schema with dtype validation, we can validate, and get contextual error messages
+        """
+        df = pd.DataFrame(data={
+            'wrong_dtype1': ['not_an_int'],
+            'wrong_dtype2': [123],
+            'wrong_dtype3': [12.5]
+        })
+
+        schema = Schema([
+            Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
+            Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
+            Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
+        ])
+
+        errors = schema.validate(df)
+
+        self.assertEqual(
+            sorted([str(x) for x in errors]),
+            sorted([
+                'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
+                'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
+                'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
+            ])
+        )
+
     def test_column_subset_detect_empty(self):
         """
         Tests that when ordered=False, validation is possible by
@@ -102,7 +129,7 @@ def test_column_subset_detect_empty(self):
 
         There will be an error if other than zero errors are found.
         """
-        
+
         df = pd.read_csv(StringIO('''
 b,a
  1,1
@@ -120,7 +147,7 @@ def test_column_subset_error(self):
         passing a subset of the columns contained in the schema
 
         Schema         a                b (validation)
-        Data Frame     b (error)        a 
+        Data Frame     b (error)        a
 
         There will be an error if a column different than 'a' or 'b' is passed
         """
@@ -131,7 +158,7 @@ def test_column_subset_error(self):
 2,3
 3,3
         '''), sep=',', header=0, dtype=str)
-        
+
         # should raise a PanSchArgumentError
         self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c'])
 

From f2d65e43c5b95166f41d4296e2b337e6f0396ba5 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Sun, 28 Apr 2019 21:49:37 +1000
Subject: [PATCH 2/3] Moved test to the right place, made default row index -1,
 added some comments

---
 pandas_schema/schema.py             |  2 +-
 pandas_schema/validation_warning.py |  2 +-
 test/test_schema.py                 | 27 -----------------------
 test/test_validation.py             | 34 +++++++++++++++++++++++++++--
 4 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py
index e8e500f..5c0442e 100644
--- a/pandas_schema/schema.py
+++ b/pandas_schema/schema.py
@@ -84,7 +84,7 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing
         for series, column in column_pairs:
             errors += column.validate(series)
 
-        return sorted(errors, key=lambda e: e.row or 0)
+        return sorted(errors, key=lambda e: e.row)
 
     def get_column_names(self):
         """
diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py
index 087200c..320be65 100644
--- a/pandas_schema/validation_warning.py
+++ b/pandas_schema/validation_warning.py
@@ -3,7 +3,7 @@ class ValidationWarning:
     Represents a difference between the schema and data frame, found during the validation of the data frame
     """
 
-    def __init__(self, message: str, value: str = None, row: int = None, column: str = None):
+    def __init__(self, message: str, value: str = None, row: int = -1, column: str = None):
         self.message = message
         self.value = value
         """The value of the failing cell in the DataFrame"""
diff --git a/test/test_schema.py b/test/test_schema.py
index d97c69e..a02cfcd 100644
--- a/test/test_schema.py
+++ b/test/test_schema.py
@@ -90,33 +90,6 @@ def test_column_subset_detect(self):
         self.assertEqual(results[0].row, 0)
         self.assertEqual(results[0].column, 'b', 'The Schema object is not associating columns and column schemas by name')
 
-    def test_dtype_validation(self):
-        """
-        Using a schema with dtype validation, we can validate, and get contextual error messages
-        """
-        df = pd.DataFrame(data={
-            'wrong_dtype1': ['not_an_int'],
-            'wrong_dtype2': [123],
-            'wrong_dtype3': [12.5]
-        })
-
-        schema = Schema([
-            Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
-            Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
-            Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
-        ])
-
-        errors = schema.validate(df)
-
-        self.assertEqual(
-            sorted([str(x) for x in errors]),
-            sorted([
-                'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
-                'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
-                'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
-            ])
-        )
-
     def test_column_subset_detect_empty(self):
         """
         Tests that when ordered=False, validation is possible by
diff --git a/test/test_validation.py b/test/test_validation.py
index 41a0161..3e8f2bb 100644
--- a/test/test_validation.py
+++ b/test/test_validation.py
@@ -2,9 +2,9 @@
 import unittest
 import re
 
-from numpy import nan
+from numpy import nan, dtype
 
-from pandas_schema import Column
+from pandas_schema import Column, Schema
 from pandas_schema.validation import _BaseValidation
 from pandas_schema.validation import *
 from pandas_schema import ValidationWarning
@@ -493,6 +493,36 @@ def test_invalid_items(self):
         self.assertEqual(type(errors[0]), ValidationWarning)
 
 
+    def test_schema(self):
+        """
+        Test this validation inside a schema, to ensure we get helpful error messages.
+        In particular, we want to make sure that a ValidationWarning without a row number won't break the schema
+        """
+        df = pd.DataFrame(data={
+            'wrong_dtype1': ['not_an_int'],
+            'wrong_dtype2': [123],
+            'wrong_dtype3': [12.5]
+        })
+
+        schema = Schema([
+            Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
+            Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
+            Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
+        ])
+
+        errors = schema.validate(df)
+
+        self.assertEqual(
+            sorted([str(x) for x in errors]),
+            sorted([
+                'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
+                'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
+                'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
+            ])
+        )
+
+
+
 class Negate(ValidationTestBase):
     """
     Tests the ~ operator on a MatchesPatternValidation

From 18bc8ef6654327f076429f7cdade2f747d7c27ff Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Sun, 28 Apr 2019 21:54:08 +1000
Subject: [PATCH 3/3] Undo example.txt edit (this is already fixed by #20)

---
 example/example.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/example.txt b/example/example.txt
index 7eca31e..f0397b9 100644
--- a/example/example.txt
+++ b/example/example.txt
@@ -1,5 +1,5 @@
 {row: 0, column: "Given Name"}: "Gerald " contains trailing whitespace
 {row: 1, column: "Age"}: "270" was not in the range [0, 120)
-{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male,Female,Other)
+{row: 1, column: "Sex"}: "male" is not in the list of legal options (Male, Female, Other)
 {row: 2, column: "Family Name"}: "Majewska " contains trailing whitespace
 {row: 2, column: "Customer ID"}: "775ANSID" does not match the pattern "\d{4}[A-Z]{4}"