diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py
index 5c0442e..a14a227 100644
--- a/pandas_schema/schema.py
+++ b/pandas_schema/schema.py
@@ -45,11 +45,45 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing
             schema_cols = len(self.columns)
             columns_to_pair = self.columns
             if df_cols != schema_cols:
+                # Name the offending columns instead of only reporting the counts.
+                # dict.fromkeys de-duplicates while keeping first-seen order, so the
+                # lists in the message follow the schema / data frame order rather
+                # than the arbitrary iteration order of a set.
+                schema_names = list(dict.fromkeys(self.get_column_names()))
+                df_names = list(dict.fromkeys(df.columns))
+                schema_columns = set(schema_names)
+                df_columns = set(df_names)
+
+                add_schema_columns = [col for col in schema_names if col not in df_columns]
+                add_df_columns = [col for col in df_names if col not in schema_columns]
+
+                if not add_schema_columns:
+                    # Also taken when both lists are empty; the message then
+                    # reports an empty list of data frame columns.
+                    detail = 'The additional data frame columns are: {add_columns}.'.format(
+                        add_columns=add_df_columns,
+                    )
+                elif not add_df_columns:
+                    detail = 'The additional schema columns are: {add_columns}.'.format(
+                        add_columns=add_schema_columns,
+                    )
+                else:
+                    detail = (
+                        'The additional schema columns are: {add_columns_1} '
+                        'and the additional data frame columns are: {add_columns_2}.'.format(
+                            add_columns_1=add_schema_columns,
+                            add_columns_2=add_df_columns,
+                        )
+                    )
+
                 errors.append(
                     ValidationWarning(
-                        'Invalid number of columns. The schema specifies {}, but the data frame has {}'.format(
-                            schema_cols,
-                            df_cols)
+                        'Invalid number of columns. The schema specifies {n_schema}, '
+                        'but the data frame has {n_df}. {detail}'.format(
+                            n_schema=schema_cols,
+                            n_df=df_cols,
+                            detail=detail,
+                        )
                     )
                 )
                 return errors
diff --git a/test/test_schema.py b/test/test_schema.py
index 18cb4ea..e87042a 100644
--- a/test/test_schema.py
+++ b/test/test_schema.py
@@ -135,6 +135,17 @@ def test_column_subset_error(self):
         # should raise a PanSchArgumentError
         self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c'])
 
+    def test_column_not_present_shown(self):
+        # 'b' is requested via `columns` but the data frame only has 'a', so the
+        # first warning should name the missing column
+        df = pd.DataFrame.from_dict({'a': [1, 2, 3]})
+
+        out = self.schema.validate(df, columns=['a', 'b'])
+        self.assertEqual(
+            out[0].message,
+            'The column b exists in the schema but not in the data frame',
+        )
+
 
 class OrderedSchema(unittest.TestCase):
     schema = Schema([