-
Notifications
You must be signed in to change notification settings - Fork 36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Column info for invalid column number #8
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,11 +45,51 @@ def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing | |
schema_cols = len(self.columns) | ||
columns_to_pair = self.columns | ||
if df_cols != schema_cols: | ||
|
||
schema_columns = set(self.get_column_names()) | ||
df_columns = set(df.columns) | ||
|
||
add_schema_columns = [col for col in schema_columns if col not in df_columns] | ||
add_df_columns = [col for col in df_columns if col not in schema_columns] | ||
|
||
if not add_schema_columns: | ||
|
||
errors.append( | ||
ValidationWarning( | ||
'Invalid number of columns. The schema specifies {n_schema}, ' | ||
'but the data frame has {n_df}. ' | ||
'The additional data frame columns are: {add_columns}.'.format( | ||
n_schema=schema_cols, | ||
n_df=df_cols, | ||
add_columns=add_df_columns, | ||
) | ||
) | ||
) | ||
return errors | ||
|
||
if not add_df_columns: | ||
errors.append( | ||
ValidationWarning( | ||
'Invalid number of columns. The schema specifies {n_schema}, ' | ||
'but the data frame has {n_df}. The additional schema columns are: {add_columns}.'.format( | ||
n_schema=schema_cols, | ||
n_df=df_cols, | ||
add_columns=add_schema_columns, | ||
) | ||
) | ||
) | ||
return errors | ||
|
||
errors.append( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do you have a second |
||
ValidationWarning( | ||
'Invalid number of columns. The schema specifies {}, but the data frame has {}'.format( | ||
schema_cols, | ||
df_cols) | ||
'Invalid number of columns. The schema specifies {n_schema}, ' | ||
'but the data frame has {n_df}. The additional schema columns are: {add_columns_1} ' | ||
'and the additional data frame columns are: {add_columns_2}.'.format( | ||
n_schema=schema_cols, | ||
n_df=df_cols, | ||
add_columns_1=add_schema_columns, | ||
add_columns_2=add_df_columns, | ||
) | ||
) | ||
) | ||
return errors | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -135,6 +135,13 @@ def test_column_subset_error(self): | |
# should raise a PanSchArgumentError | ||
self.assertRaises(PanSchArgumentError, self.schema.validate, df, columns=['c']) | ||
|
||
def test_column_not_present_shown(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add a docstring that explains this test (can be one sentence) |
||
|
||
df = pd.DataFrame.from_dict({'a': [1, 2, 3]}) | ||
|
||
out = self.schema.validate(df, columns=['a', 'b']) | ||
assert out[0].message == 'The column b exists in the schema but not in the data frame' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be a |
||
|
||
|
||
class OrderedSchema(unittest.TestCase): | ||
schema = Schema([ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use set operations here (https://docs.python.org/3/library/stdtypes.html#set), e.g.:
add_schema_columns = schema_columns - df_columns