From 1469a910fb2f24b127945747487c6188d6149a4f Mon Sep 17 00:00:00 2001 From: Eneko Martin Martinez Date: Tue, 6 Jul 2021 09:52:38 +0200 Subject: [PATCH 1/3] Improve benchmarking tools --- pysd/tools/benchmarking.py | 73 +++++++++++++++++---- tests/unit_test_benchmarking.py | 110 +++++++++++++++++++++++++++++++- tests/unit_test_pysd.py | 2 +- 3 files changed, 170 insertions(+), 15 deletions(-) diff --git a/pysd/tools/benchmarking.py b/pysd/tools/benchmarking.py index a61f93e7..abaaf5ea 100644 --- a/pysd/tools/benchmarking.py +++ b/pysd/tools/benchmarking.py @@ -48,7 +48,9 @@ def runner(model_file, canonical_file=None, transpose=False): else: raise FileNotFoundError('\nCanonical output file not found.') - canon = load_outputs(canonical_file, transpose) + canon = load_outputs(canonical_file, + transpose=transpose, + encoding=detect_encoding(canonical_file)) # load model if model_file.lower().endswith('.mdl'): @@ -63,7 +65,7 @@ def runner(model_file, canonical_file=None, transpose=False): return model.run(return_columns=canon.columns), canon -def load_outputs(file_name, transpose=False): +def load_outputs(file_name, transpose=False, columns=None, encoding=None): """ Load outputs file @@ -76,6 +78,16 @@ def load_outputs(file_name, transpose=False): If True reads transposed outputs file, i.e. one variable per row. Default is False. + columns: list or None (optional) + List of the column names to load. If None loads all the columns. + Default is None. Note, if transpose=False, the loading will be + faster as only selected columns will be loaded. If transpose=True + the whole file must be read and it will be subselected later. + + encoding: str or None (optional) + Encoding type to read output file. Needed if the file has special + characters. Default is None. 
+ Returns ------- pandas.DataFrame @@ -84,19 +96,28 @@ def load_outputs(file_name, transpose=False): """ read_func = {'.csv': pd.read_csv, '.tab': pd.read_table} + if columns: + columns = set(columns) + if not transpose: + columns.add("Time") + for end, func in read_func.items(): if file_name.lower().endswith(end): if transpose: out = func(file_name, - encoding=_detect_encoding(file_name), + encoding=encoding, index_col=0).T + if columns: + out = out[columns] else: out = func(file_name, - encoding=_detect_encoding(file_name), - index_col='Time') + encoding=encoding, + usecols=columns, + index_col="Time") out.index = out.index.astype(float) - return out + # return the dataframe removing nan index values + return out[~np.isnan(out.index)] raise ValueError( f"\nNot able to read '{file_name}'. " @@ -178,18 +199,44 @@ def assert_frames_close(actual, expected, assertion="raise", http://nbviewer.jupyter.org/gist/jiffyclub/ac2e7506428d5e1d587b """ - assert (isinstance(actual, pd.DataFrame) and - isinstance(expected, pd.DataFrame)), \ - 'Inputs must both be pandas DataFrames.' + if not isinstance(actual, pd.DataFrame)\ + or not isinstance(expected, pd.DataFrame): + raise TypeError('\nInputs must both be pandas DataFrames.') + + expected_cols, actual_cols = set(expected.columns), set(actual.columns) + + if expected_cols != actual_cols: + # columns are not equal + message = "" + + if actual_cols.difference(expected_cols): + columns = ["'" + col + "'" for col + in actual_cols.difference(expected_cols)] + columns = ", ".join(columns) + message += '\nColumns ' + columns\ + + ' from actual values not found in expected values.' + + if expected_cols.difference(actual_cols): + columns = ["'" + col + "'" for col + in expected_cols.difference(actual_cols)] + columns = ", ".join(columns) + message += '\nColumns ' + columns\ + + ' from expected values not found in actual values.' 
+ + if assertion == "raise": + raise ValueError( + '\nColumns from actual and expected values must be equal.' + + message) + else: + warnings.warn(message) - assert set(expected.columns) == set(actual.columns), \ - 'test set columns must be equal to those in actual/observed set.' + columns = actual_cols.intersection(expected_cols) assert np.all(np.equal(expected.index.values, actual.index.values)), \ 'test set and actual set must share a common index' \ 'instead found' + expected.index.values + 'vs' + actual.index.values - for col in expected.columns: + for col in columns: # if for Vensim outputs where constant values are only in the first row if np.isnan(expected[col].values[1:]).all(): expected[col] = expected[col].values[0] @@ -245,7 +292,7 @@ def assert_allclose(x, y, rtol=1.e-5, atol=1.e-5): assert np.all(np.less_equal(abs(x - y), atol + rtol * abs(y))) -def _detect_encoding(filename): +def detect_encoding(filename): """ Detects the encoding of a file. diff --git a/tests/unit_test_benchmarking.py b/tests/unit_test_benchmarking.py index 106b5fe4..47cab816 100644 --- a/tests/unit_test_benchmarking.py +++ b/tests/unit_test_benchmarking.py @@ -87,4 +87,112 @@ def test_transposed_frame(self): assert_frames_close( load_outputs('data/out_teacup.csv'), - load_outputs('data/out_teacup_transposed.csv', transpose=True)) \ No newline at end of file + load_outputs('data/out_teacup_transposed.csv', transpose=True)) + + def test_load_columns(self): + from pysd.tools.benchmarking import load_outputs + + out0 = load_outputs( + 'data/out_teacup.csv') + + out1 = load_outputs( + 'data/out_teacup.csv', + columns=["Room Temperature", "Teacup Temperature"]) + + out2 = load_outputs( + 'data/out_teacup_transposed.csv', + transpose=True, + columns=["Heat Loss to Room"]) + + self.assertEqual( + set(out1.columns), + set(["Room Temperature", "Teacup Temperature"])) + + self.assertEqual( + set(out2.columns), + set(["Heat Loss to Room"])) + + self.assertTrue((out0.index == 
out1.index).all()) + self.assertTrue((out0.index == out2.index).all()) + + def test_different_cols(self): + from warnings import catch_warnings + from pysd.tools.benchmarking import assert_frames_close + import pandas as pd + + d1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'd': [6, 7]}) + d2 = pd.DataFrame({'a': [1, 2]}) + d3 = pd.DataFrame({'a': [1, 2], 'c': [3, 4]}) + + with self.assertRaises(ValueError) as err: + assert_frames_close( + actual=d1, + expected=d2) + + self.assertIn( + "Columns from actual and expected values must be equal.", + str(err.exception)) + + with catch_warnings(record=True) as ws: + assert_frames_close( + actual=d1, + expected=d2, + assertion="warn") + + # use only user warnings + wu = [w for w in ws if issubclass(w.category, UserWarning)] + self.assertEqual(len(wu), 1) + + self.assertIn("'b'", str(wu[0].message)) + self.assertIn("'d'", str(wu[0].message)) + self.assertIn( + "from actual values not found in expected values.", + str(wu[0].message)) + + with catch_warnings(record=True) as ws: + assert_frames_close( + expected=d1, + actual=d2, + assertion="warn") + + # use only user warnings + wu = [w for w in ws if issubclass(w.category, UserWarning)] + self.assertEqual(len(wu), 1) + + self.assertIn("'b'", str(wu[0].message)) + self.assertIn("'d'", str(wu[0].message)) + self.assertIn( + "from expected values not found in actual values.", + str(wu[0].message)) + + with catch_warnings(record=True) as ws: + assert_frames_close( + actual=d1, + expected=d3, + assertion="warn") + + # use only user warnings + wu = [w for w in ws if issubclass(w.category, UserWarning)] + self.assertEqual(len(wu), 1) + + self.assertIn("'b'", str(wu[0].message)) + self.assertIn("'d'", str(wu[0].message)) + self.assertIn( + "from actual values not found in expected values.", + str(wu[0].message)) + + self.assertIn( + "Columns 'c' from expected values not found in actual " + "values.", str(wu[0].message)) + + def test_invalid_input(self): + from pysd.tools.benchmarking 
import assert_frames_close + + with self.assertRaises(TypeError) as err: + assert_frames_close( + actual=[1, 2], + expected=[1, 2]) + + self.assertIn( + "Inputs must both be pandas DataFrames.", + str(err.exception)) diff --git a/tests/unit_test_pysd.py b/tests/unit_test_pysd.py index 78ef1403..cd3dad16 100644 --- a/tests/unit_test_pysd.py +++ b/tests/unit_test_pysd.py @@ -1653,7 +1653,7 @@ def test_py_model_file(self): import pysd model = pysd.read_vensim(test_model) - self.assertEqual(model.py_model_file, + self.assertEqual(model.py_model_file, test_model.replace(".mdl", ".py")) def test_mdl_file(self): From a186cdfb0a16349e7f27bdb551072a414e9a3f43 Mon Sep 17 00:00:00 2001 From: Eneko Martin Martinez Date: Wed, 7 Jul 2021 14:41:06 +0200 Subject: [PATCH 2/3] Remove nan values from constant variables of actual --- pysd/tools/benchmarking.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pysd/tools/benchmarking.py b/pysd/tools/benchmarking.py index abaaf5ea..24ced225 100644 --- a/pysd/tools/benchmarking.py +++ b/pysd/tools/benchmarking.py @@ -80,9 +80,10 @@ def load_outputs(file_name, transpose=False, columns=None, encoding=None): columns: list or None (optional) List of the column names to load. If None loads all the columns. - Default is None. Note, if transpose=False, the loading will be - faster as only selected columns will be loaded. If transpose=True - the whole file must be read and it will be subselected later. + Default is None. + NOTE: if transpose=False, the loading will be faster as only + selected columns will be loaded. If transpose=True the whole + file must be read and it will be subselected later. encoding: str or None (optional) Encoding type to read output file. 
Needed if the file has special @@ -240,6 +241,8 @@ def assert_frames_close(actual, expected, assertion="raise", # if for Vensim outputs where constant values are only in the first row if np.isnan(expected[col].values[1:]).all(): expected[col] = expected[col].values[0] + if np.isnan(actual[col].values[1:]).all(): + actual[col] = actual[col].values[0] try: assert_allclose(expected[col].values, actual[col].values, From d3d7ea0209e6379fd0fdc28a1578a873898ac008 Mon Sep 17 00:00:00 2001 From: Eneko Martin Martinez Date: Thu, 8 Jul 2021 12:59:09 +0200 Subject: [PATCH 3/3] Add verbose option and vectorization --- pysd/tools/benchmarking.py | 84 +++++++++++++++++++++------------ tests/unit_test_benchmarking.py | 56 ++++++++++++++++++++++ 2 files changed, 111 insertions(+), 29 deletions(-) diff --git a/pysd/tools/benchmarking.py b/pysd/tools/benchmarking.py index 24ced225..acd0ce29 100644 --- a/pysd/tools/benchmarking.py +++ b/pysd/tools/benchmarking.py @@ -126,7 +126,7 @@ def load_outputs(file_name, transpose=False, columns=None, encoding=None): def assert_frames_close(actual, expected, assertion="raise", - precision=2, **kwargs): + verbose=False, precision=2, **kwargs): """ Compare DataFrame items by column and raise AssertionError if any column is not equal. @@ -147,8 +147,13 @@ def assert_frames_close(actual, expected, assertion="raise", that two frames are close. Otherwise, it will show a warning message. Default is "raise". + verbose: bool (optional) + If True, if any column is not close the actual and expected values + will be printed in the error/warning message with the difference. + Default is False. + precision: int (optional) - Precision to print the numerical values of assertion message. + Precision for the numerical values in the verbose assertion message. Default is 2. kwargs: @@ -172,6 +177,18 @@ def assert_frames_close(actual, expected, assertion="raise", Traceback (most recent call last): ... 
AssertionError: + Following columns are not close: + \t'0' + + >>> assert_frames_close( + ... pd.DataFrame(100, index=range(5), columns=range(3)), + ... pd.DataFrame(150, index=range(5), columns=range(3)), + ... verbose=True, rtol=.2) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + AssertionError: + Following columns are not close: + \t'0' Column '0' is not close. Expected values: \t[150, 150, 150, 150, 150] @@ -186,13 +203,8 @@ def assert_frames_close(actual, expected, assertion="raise", ... rtol=.2, assertion="warn") ... UserWarning: - Column '0' is not close. - Expected values: - \t[150, 150, 150, 150, 150] - Actual values: - \t[100, 100, 100, 100, 100] - Difference: - \t[50, 50, 50, 50, 50] + Following columns are not close: + \t'0' References ---------- @@ -231,25 +243,31 @@ def assert_frames_close(actual, expected, assertion="raise", else: warnings.warn(message) - columns = actual_cols.intersection(expected_cols) + columns = list(actual_cols.intersection(expected_cols)) + # TODO let compare dataframes with different timestamps if "warn" assert np.all(np.equal(expected.index.values, actual.index.values)), \ 'test set and actual set must share a common index' \ 'instead found' + expected.index.values + 'vs' + actual.index.values - for col in columns: - # if for Vensim outputs where constant values are only in the first row - if np.isnan(expected[col].values[1:]).all(): - expected[col] = expected[col].values[0] - if np.isnan(actual[col].values[1:]).all(): - actual[col] = actual[col].values[0] - try: - assert_allclose(expected[col].values, - actual[col].values, - **kwargs) - - except AssertionError: - assertion_details = '\n\n'\ + # if for Vensim outputs where constant values are only in the first row + _remove_constant_nan(expected) + _remove_constant_nan(actual) + + c = assert_allclose(expected[columns], + actual[columns], + **kwargs) + + if c.all(): + return + + columns = np.array(columns, dtype=str)[~c.values] + + 
assertion_details = "\nFollowing columns are not close:\n\t"\ + + ", ".join(columns) + if verbose: + for col in columns: + assertion_details += '\n\n'\ + f"Column '{col}' is not close."\ + '\n\nExpected values:\n\t'\ + np.array2string(expected[col].values, @@ -264,12 +282,12 @@ def assert_frames_close(actual, expected, assertion="raise", + np.array2string(expected[col].values-actual[col].values, precision=precision, separator=', ', - suppress_small=True)\ + suppress_small=True) - if assertion == "raise": - raise AssertionError(assertion_details) - else: - warnings.warn(assertion_details) + if assertion == "raise": + raise AssertionError(assertion_details) + else: + warnings.warn(assertion_details) def assert_allclose(x, y, rtol=1.e-5, atol=1.e-5): @@ -292,7 +310,15 @@ def assert_allclose(x, y, rtol=1.e-5, atol=1.e-5): None """ - assert np.all(np.less_equal(abs(x - y), atol + rtol * abs(y))) + return (abs(x - y) <= atol + rtol * abs(y)).all() + + +def _remove_constant_nan(df): + """ + Removes NaN values in constant value columns produced by Vensim + """ + nan_cols = np.isnan(df.iloc[1:, :]).all() + df.loc[:, nan_cols] = df.loc[:, nan_cols].iloc[0].values def detect_encoding(filename): diff --git a/tests/unit_test_benchmarking.py b/tests/unit_test_benchmarking.py index 47cab816..182edbd7 100644 --- a/tests/unit_test_benchmarking.py +++ b/tests/unit_test_benchmarking.py @@ -44,6 +44,32 @@ def test_different_frames_error(self): load_outputs('data/out_teacup.csv'), load_outputs('data/out_teacup_modified.csv')) + self.assertIn( + "Following columns are not close:\n\tTeacup Temperature", + str(err.exception)) + + self.assertNotIn( + "Column 'Teacup Temperature' is not close.", + str(err.exception)) + + self.assertNotIn( + "Actual values:\n\t", + str(err.exception)) + + self.assertNotIn( + "Expected values:\n\t", + str(err.exception)) + + with self.assertRaises(AssertionError) as err: + assert_frames_close( + load_outputs('data/out_teacup.csv'), + 
load_outputs('data/out_teacup_modified.csv'), + verbose=True) + + self.assertIn( + "Following columns are not close:\n\tTeacup Temperature", + str(err.exception)) + self.assertIn( "Column 'Teacup Temperature' is not close.", str(err.exception)) @@ -70,6 +96,36 @@ def test_different_frames_warning(self): wu = [w for w in ws if issubclass(w.category, UserWarning)] self.assertEqual(len(wu), 1) + self.assertIn( + "Following columns are not close:\n\tTeacup Temperature", + str(wu[0].message)) + + self.assertNotIn( + "Column 'Teacup Temperature' is not close.", + str(wu[0].message)) + + self.assertNotIn( + "Actual values:\n\t", + str(wu[0].message)) + + self.assertNotIn( + "Expected values:\n\t", + str(wu[0].message)) + + with catch_warnings(record=True) as ws: + assert_frames_close( + load_outputs('data/out_teacup.csv'), + load_outputs('data/out_teacup_modified.csv'), + assertion="warn", verbose=True) + + # use only user warnings + wu = [w for w in ws if issubclass(w.category, UserWarning)] + self.assertEqual(len(wu), 1) + + self.assertIn( + "Following columns are not close:\n\tTeacup Temperature", + str(wu[0].message)) + self.assertIn( "Column 'Teacup Temperature' is not close.", str(wu[0].message))