From 58d441569f8334e51cfc11d7f6876b0fdce86db9 Mon Sep 17 00:00:00 2001 From: Oliver Furtmaier Date: Mon, 12 Mar 2018 10:24:07 +0100 Subject: [PATCH 1/3] show column names for schema and data frame in case column numbers do not match --- pandas_schema/schema.py | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 5c0442e..4fcaecf 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -45,11 +45,51 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing schema_cols = len(self.columns) columns_to_pair = self.columns if df_cols != schema_cols: + + schema_columns = self.get_column_names() + df_columns = df.columns + + add_schema_columns = [col for col in schema_columns if col not in df_columns] + add_df_columns = [col for col in df_columns if col not in schema_columns] + + if not add_schema_columns: + + errors.append( + ValidationWarning( + 'Invalid number of columns. The schema specifies {n_schema}, ' + 'but the data frame has {n_df}. ' + 'The additional data frame columns are: {add_columns}.'.format( + n_schema=schema_cols, + n_df=df_cols, + add_columns=add_schema_columns, + ) + ) + ) + return errors + + if not add_df_columns: + errors.append( + ValidationWarning( + 'Invalid number of columns. The schema specifies {n_schema}, ' + 'but the data frame has {n_df}. The additional schema columns are: {add_columns}.'.format( + n_schema=schema_cols, + n_df=df_cols, + add_columns=add_df_columns, + ) + ) + ) + return errors + errors.append( ValidationWarning( - 'Invalid number of columns. The schema specifies {}, but the data frame has {}'.format( - schema_cols, - df_cols) + 'Invalid number of columns. The schema specifies {n_schema}, ' + 'but the data frame has {n_df}. The additional schema columns are: {add_columns_1} ' + 'and the additional data frame columns are: {add_columns_2}.'.format( + n_schema=schema_cols, + n_df=df_cols, + add_columns_1=add_schema_columns, + add_columns_2=add_df_columns, + ) ) ) return errors From 1f93728e3ab479b6ef9449a77777ee473713bcdb Mon Sep 17 00:00:00 2001 From: Oliver Furtmaier Date: Mon, 12 Mar 2018 10:45:10 +0100 Subject: [PATCH 2/3] fix wrong show for add_schema_ and add_df_columns in if-statement --- pandas_schema/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 4fcaecf..e3f17ea 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -61,7 +61,7 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing 'The additional data frame columns are: {add_columns}.'.format( n_schema=schema_cols, n_df=df_cols, - add_columns=add_schema_columns, + add_columns=add_df_columns, ) ) ) @@ -74,7 +74,7 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing 'but the data frame has {n_df}. The additional schema columns are: {add_columns}.'.format( n_schema=schema_cols, n_df=df_cols, - add_columns=add_df_columns, + add_columns=add_schema_columns, ) ) ) From b0aaab7ad3f52cb714888736e59d268813702490 Mon Sep 17 00:00:00 2001 From: Oliver Furtmaier Date: Fri, 18 Jan 2019 10:31:36 +0100 Subject: [PATCH 3/3] added set to column list and unit test which demonstrates column names are shown in output message of validation warning --- pandas_schema/schema.py | 4 ++-- test/test_schema.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index e3f17ea..a14a227 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -46,8 +46,8 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing columns_to_pair = self.columns if df_cols != schema_cols: - schema_columns = self.get_column_names() - df_columns = df.columns + schema_columns = set(self.get_column_names()) + df_columns = set(df.columns) add_schema_columns = [col for col in schema_columns if col not in df_columns] add_df_columns = [col for col in df_columns if col not in schema_columns] diff --git a/test/test_schema.py b/test/test_schema.py index 18cb4ea..e87042a 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -135,6 +135,13 @@ def test_column_subset_error(self): # should raise a PanSchArgumentError self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c']) + def test_column_not_present_shown(self): + + df = pd.DataFrame.from_dict({'a': [1, 2, 3]}) + + out = self.schema.validate(df, columns=['a', 'b']) + assert out[0].message == 'The column b exists in the schema but not in the data frame' + class OrderedSchema(unittest.TestCase): schema = Schema([