From 1469a910fb2f24b127945747487c6188d6149a4f Mon Sep 17 00:00:00 2001
From: Eneko Martin Martinez <eneko.martin.martinez@gmail.com>
Date: Tue, 6 Jul 2021 09:52:38 +0200
Subject: [PATCH 1/3] Improve benchmarking tools

---
 pysd/tools/benchmarking.py      |  73 +++++++++++++++++----
 tests/unit_test_benchmarking.py | 110 +++++++++++++++++++++++++++++++-
 tests/unit_test_pysd.py         |   2 +-
 3 files changed, 170 insertions(+), 15 deletions(-)

diff --git a/pysd/tools/benchmarking.py b/pysd/tools/benchmarking.py
index a61f93e7..abaaf5ea 100644
--- a/pysd/tools/benchmarking.py
+++ b/pysd/tools/benchmarking.py
@@ -48,7 +48,9 @@ def runner(model_file, canonical_file=None, transpose=False):
         else:
             raise FileNotFoundError('\nCanonical output file not found.')
 
-    canon = load_outputs(canonical_file, transpose)
+    canon = load_outputs(canonical_file,
+                         transpose=transpose,
+                         encoding=detect_encoding(canonical_file))
 
     # load model
     if model_file.lower().endswith('.mdl'):
@@ -63,7 +65,7 @@ def runner(model_file, canonical_file=None, transpose=False):
     return model.run(return_columns=canon.columns), canon
 
 
-def load_outputs(file_name, transpose=False):
+def load_outputs(file_name, transpose=False, columns=None, encoding=None):
     """
     Load outputs file
 
@@ -76,6 +78,16 @@ def load_outputs(file_name, transpose=False):
         If True reads transposed outputs file, i.e. one variable per row.
         Default is False.
 
+    columns: list or None (optional)
+        List of the column names to load. If None loads all the columns.
+        Default is None. Note, if transpose=False, the loading will be
+        faster as only selected columns will be loaded. If transpose=True
+        the whole file must be read and it will be subselected later.
+
+    encoding: str or None (optional)
+        Encoding type to read output file. Needed if the file has special
+        characters. Default is None.
+
     Returns
     -------
     pandas.DataFrame
@@ -84,19 +96,28 @@ def load_outputs(file_name, transpose=False):
     """
     read_func = {'.csv': pd.read_csv, '.tab': pd.read_table}
 
+    if columns:
+        columns = set(columns)
+        if not transpose:
+            columns.add("Time")
+
     for end, func in read_func.items():
         if file_name.lower().endswith(end):
             if transpose:
                 out = func(file_name,
-                           encoding=_detect_encoding(file_name),
+                           encoding=encoding,
                            index_col=0).T
+                if columns:
+                    out = out[columns]
             else:
                 out = func(file_name,
-                           encoding=_detect_encoding(file_name),
-                           index_col='Time')
+                           encoding=encoding,
+                           usecols=columns,
+                           index_col="Time")
 
             out.index = out.index.astype(float)
-            return out
+            # return the dataframe removing nan index values
+            return out[~np.isnan(out.index)]
 
     raise ValueError(
         f"\nNot able to read '{file_name}'. "
@@ -178,18 +199,44 @@ def assert_frames_close(actual, expected, assertion="raise",
         http://nbviewer.jupyter.org/gist/jiffyclub/ac2e7506428d5e1d587b
 
     """
-    assert (isinstance(actual, pd.DataFrame) and
-            isinstance(expected, pd.DataFrame)), \
-        'Inputs must both be pandas DataFrames.'
+    if not isinstance(actual, pd.DataFrame)\
+       or not isinstance(expected, pd.DataFrame):
+        raise TypeError('\nInputs must both be pandas DataFrames.')
+
+    expected_cols, actual_cols = set(expected.columns), set(actual.columns)
+
+    if expected_cols != actual_cols:
+        # columns are not equal
+        message = ""
+
+        if actual_cols.difference(expected_cols):
+            columns = ["'" + col + "'" for col
+                       in actual_cols.difference(expected_cols)]
+            columns = ", ".join(columns)
+            message += '\nColumns ' + columns\
+                       + ' from actual values not found in expected values.'
+
+        if expected_cols.difference(actual_cols):
+            columns = ["'" + col + "'" for col
+                       in expected_cols.difference(actual_cols)]
+            columns = ", ".join(columns)
+            message += '\nColumns ' + columns\
+                       + ' from expected values not found in actual values.'
+
+        if assertion == "raise":
+            raise ValueError(
+                '\nColumns from actual and expected values must be equal.'
+                + message)
+        else:
+            warnings.warn(message)
 
-    assert set(expected.columns) == set(actual.columns), \
-        'test set columns must be equal to those in actual/observed set.'
+    columns = actual_cols.intersection(expected_cols)
 
     assert np.all(np.equal(expected.index.values, actual.index.values)), \
         'test set and actual set must share a common index' \
         'instead found' + expected.index.values + 'vs' + actual.index.values
 
-    for col in expected.columns:
+    for col in columns:
         # if for Vensim outputs where constant values are only in the first row
         if np.isnan(expected[col].values[1:]).all():
             expected[col] = expected[col].values[0]
@@ -245,7 +292,7 @@ def assert_allclose(x, y, rtol=1.e-5, atol=1.e-5):
     assert np.all(np.less_equal(abs(x - y), atol + rtol * abs(y)))
 
 
-def _detect_encoding(filename):
+def detect_encoding(filename):
     """
     Detects the encoding of a file.
 
diff --git a/tests/unit_test_benchmarking.py b/tests/unit_test_benchmarking.py
index 106b5fe4..47cab816 100644
--- a/tests/unit_test_benchmarking.py
+++ b/tests/unit_test_benchmarking.py
@@ -87,4 +87,112 @@ def test_transposed_frame(self):
 
         assert_frames_close(
             load_outputs('data/out_teacup.csv'),
-            load_outputs('data/out_teacup_transposed.csv', transpose=True))
\ No newline at end of file
+            load_outputs('data/out_teacup_transposed.csv', transpose=True))
+
+    def test_load_columns(self):
+        from pysd.tools.benchmarking import load_outputs
+
+        out0 = load_outputs(
+            'data/out_teacup.csv')
+
+        out1 = load_outputs(
+            'data/out_teacup.csv',
+            columns=["Room Temperature", "Teacup Temperature"])
+
+        out2 = load_outputs(
+            'data/out_teacup_transposed.csv',
+            transpose=True,
+            columns=["Heat Loss to Room"])
+
+        self.assertEqual(
+            set(out1.columns),
+            set(["Room Temperature", "Teacup Temperature"]))
+
+        self.assertEqual(
+            set(out2.columns),
+            set(["Heat Loss to Room"]))
+
+        self.assertTrue((out0.index == out1.index).all())
+        self.assertTrue((out0.index == out2.index).all())
+
+    def test_different_cols(self):
+        from warnings import catch_warnings
+        from pysd.tools.benchmarking import assert_frames_close
+        import pandas as pd
+
+        d1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'd': [6, 7]})
+        d2 = pd.DataFrame({'a': [1, 2]})
+        d3 = pd.DataFrame({'a': [1, 2], 'c': [3, 4]})
+
+        with self.assertRaises(ValueError) as err:
+            assert_frames_close(
+                actual=d1,
+                expected=d2)
+
+        self.assertIn(
+            "Columns from actual and expected values must be equal.",
+            str(err.exception))
+
+        with catch_warnings(record=True) as ws:
+            assert_frames_close(
+                actual=d1,
+                expected=d2,
+                assertion="warn")
+
+            # use only user warnings
+            wu = [w for w in ws if issubclass(w.category, UserWarning)]
+            self.assertEqual(len(wu), 1)
+
+            self.assertIn("'b'", str(wu[0].message))
+            self.assertIn("'d'", str(wu[0].message))
+            self.assertIn(
+                "from actual values not found in expected values.",
+                str(wu[0].message))
+
+        with catch_warnings(record=True) as ws:
+            assert_frames_close(
+                expected=d1,
+                actual=d2,
+                assertion="warn")
+
+            # use only user warnings
+            wu = [w for w in ws if issubclass(w.category, UserWarning)]
+            self.assertEqual(len(wu), 1)
+
+            self.assertIn("'b'", str(wu[0].message))
+            self.assertIn("'d'", str(wu[0].message))
+            self.assertIn(
+                "from expected values not found in actual values.",
+                str(wu[0].message))
+
+        with catch_warnings(record=True) as ws:
+            assert_frames_close(
+                actual=d1,
+                expected=d3,
+                assertion="warn")
+
+            # use only user warnings
+            wu = [w for w in ws if issubclass(w.category, UserWarning)]
+            self.assertEqual(len(wu), 1)
+
+            self.assertIn("'b'", str(wu[0].message))
+            self.assertIn("'d'", str(wu[0].message))
+            self.assertIn(
+                "from actual values not found in expected values.",
+                str(wu[0].message))
+
+            self.assertIn(
+                "Columns 'c' from expected values not found in actual "
+                "values.", str(wu[0].message))
+
+    def test_invalid_input(self):
+        from pysd.tools.benchmarking import assert_frames_close
+
+        with self.assertRaises(TypeError) as err:
+            assert_frames_close(
+                actual=[1, 2],
+                expected=[1, 2])
+
+        self.assertIn(
+            "Inputs must both be pandas DataFrames.",
+            str(err.exception))
diff --git a/tests/unit_test_pysd.py b/tests/unit_test_pysd.py
index 78ef1403..cd3dad16 100644
--- a/tests/unit_test_pysd.py
+++ b/tests/unit_test_pysd.py
@@ -1653,7 +1653,7 @@ def test_py_model_file(self):
         import pysd
 
         model = pysd.read_vensim(test_model)
-        self.assertEqual(model.py_model_file, 
+        self.assertEqual(model.py_model_file,
                          test_model.replace(".mdl", ".py"))
 
     def test_mdl_file(self):

From a186cdfb0a16349e7f27bdb551072a414e9a3f43 Mon Sep 17 00:00:00 2001
From: Eneko Martin Martinez <eneko.martin.martinez@gmail.com>
Date: Wed, 7 Jul 2021 14:41:06 +0200
Subject: [PATCH 2/3] Remove nan values from constant variables of actual

---
 pysd/tools/benchmarking.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pysd/tools/benchmarking.py b/pysd/tools/benchmarking.py
index abaaf5ea..24ced225 100644
--- a/pysd/tools/benchmarking.py
+++ b/pysd/tools/benchmarking.py
@@ -80,9 +80,10 @@ def load_outputs(file_name, transpose=False, columns=None, encoding=None):
 
     columns: list or None (optional)
         List of the column names to load. If None loads all the columns.
-        Default is None. Note, if transpose=False, the loading will be
-        faster as only selected columns will be loaded. If transpose=True
-        the whole file must be read and it will be subselected later.
+        Default is None.
+        NOTE: if transpose=False, the loading will be faster as only
+        selected columns will be loaded. If transpose=True the whole
+        file must be read and it will be subselected later.
 
     encoding: str or None (optional)
         Encoding type to read output file. Needed if the file has special
@@ -240,6 +241,8 @@ def assert_frames_close(actual, expected, assertion="raise",
         # if for Vensim outputs where constant values are only in the first row
         if np.isnan(expected[col].values[1:]).all():
             expected[col] = expected[col].values[0]
+        if np.isnan(actual[col].values[1:]).all():
+            actual[col] = actual[col].values[0]
         try:
             assert_allclose(expected[col].values,
                             actual[col].values,

From d3d7ea0209e6379fd0fdc28a1578a873898ac008 Mon Sep 17 00:00:00 2001
From: Eneko Martin Martinez <eneko.martin.martinez@gmail.com>
Date: Thu, 8 Jul 2021 12:59:09 +0200
Subject: [PATCH 3/3] Add verbose option and vectorization

---
 pysd/tools/benchmarking.py      | 84 +++++++++++++++++++++------------
 tests/unit_test_benchmarking.py | 56 ++++++++++++++++++++++
 2 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/pysd/tools/benchmarking.py b/pysd/tools/benchmarking.py
index 24ced225..acd0ce29 100644
--- a/pysd/tools/benchmarking.py
+++ b/pysd/tools/benchmarking.py
@@ -126,7 +126,7 @@ def load_outputs(file_name, transpose=False, columns=None, encoding=None):
 
 
 def assert_frames_close(actual, expected, assertion="raise",
-                        precision=2, **kwargs):
+                        verbose=False, precision=2, **kwargs):
     """
     Compare DataFrame items by column and
     raise AssertionError if any column is not equal.
@@ -147,8 +147,13 @@ def assert_frames_close(actual, expected, assertion="raise",
         that two frames are close. Otherwise, it will show a warning
         message. Default is "raise".
 
+    verbose: bool (optional)
+        If True and any column is not close, the actual and expected values
+        are added to the error/warning message along with their difference.
+        Default is False.
+
     precision: int (optional)
-        Precision to print the numerical values of assertion message.
+        Precision to print the numerical values of assertion verbose message.
         Default is 2.
 
     kwargs:
@@ -172,6 +177,18 @@ def assert_frames_close(actual, expected, assertion="raise",
     Traceback (most recent call last):
     ...
     AssertionError:
+    Following columns are not close:
+    \t'0'
+
+    >>> assert_frames_close(
+    ...     pd.DataFrame(100, index=range(5), columns=range(3)),
+    ...     pd.DataFrame(150, index=range(5), columns=range(3)),
+    ...     verbose=True, rtol=.2)  # doctest: +IGNORE_EXCEPTION_DETAIL
+    Traceback (most recent call last):
+    ...
+    AssertionError:
+    Following columns are not close:
+    \t0, 1, 2
     Column '0' is not close.
     Expected values:
     \t[150, 150, 150, 150, 150]
@@ -186,13 +203,8 @@ def assert_frames_close(actual, expected, assertion="raise",
     ...     rtol=.2, assertion="warn")
     ...
     UserWarning:
-    Column '0' is not close.
-    Expected values:
-    \t[150, 150, 150, 150, 150]
-    Actual values:
-    \t[100, 100, 100, 100, 100]
-    Difference:
-    \t[50, 50, 50, 50, 50]
+    Following columns are not close:
+    \t'0'
 
     References
     ----------
@@ -231,25 +243,31 @@ def assert_frames_close(actual, expected, assertion="raise",
         else:
             warnings.warn(message)
 
-    columns = actual_cols.intersection(expected_cols)
+    columns = list(actual_cols.intersection(expected_cols))
 
+    # TODO allow comparing dataframes with different timestamps if "warn"
     assert np.all(np.equal(expected.index.values, actual.index.values)), \
         'test set and actual set must share a common index' \
         'instead found' + expected.index.values + 'vs' + actual.index.values
 
-    for col in columns:
-        # if for Vensim outputs where constant values are only in the first row
-        if np.isnan(expected[col].values[1:]).all():
-            expected[col] = expected[col].values[0]
-        if np.isnan(actual[col].values[1:]).all():
-            actual[col] = actual[col].values[0]
-        try:
-            assert_allclose(expected[col].values,
-                            actual[col].values,
-                            **kwargs)
-
-        except AssertionError:
-            assertion_details = '\n\n'\
+    # for Vensim outputs where constant values are only in the first row
+    _remove_constant_nan(expected)
+    _remove_constant_nan(actual)
+
+    c = assert_allclose(expected[columns],
+                        actual[columns],
+                        **kwargs)
+
+    if c.all():
+        return
+
+    columns = np.array(columns, dtype=str)[~c.values]
+
+    assertion_details = "\nFollowing columns are not close:\n\t"\
+                        + ", ".join(columns)
+    if verbose:
+        for col in columns:
+            assertion_details += '\n\n'\
                 + f"Column '{col}' is not close."\
                 + '\n\nExpected values:\n\t'\
                 + np.array2string(expected[col].values,
@@ -264,12 +282,12 @@ def assert_frames_close(actual, expected, assertion="raise",
                 + np.array2string(expected[col].values-actual[col].values,
                                   precision=precision,
                                   separator=', ',
-                                  suppress_small=True)\
+                                  suppress_small=True)
 
-            if assertion == "raise":
-                raise AssertionError(assertion_details)
-            else:
-                warnings.warn(assertion_details)
+    if assertion == "raise":
+        raise AssertionError(assertion_details)
+    else:
+        warnings.warn(assertion_details)
 
 
 def assert_allclose(x, y, rtol=1.e-5, atol=1.e-5):
@@ -292,7 +310,15 @@ def assert_allclose(x, y, rtol=1.e-5, atol=1.e-5):
     None
 
     """
-    assert np.all(np.less_equal(abs(x - y), atol + rtol * abs(y)))
+    return (abs(x - y) <= atol + rtol * abs(y)).all()
+
+
+def _remove_constant_nan(df):
+    """
+    Removes nan values in constant value columns produced by Vensim
+    """
+    nan_cols = np.isnan(df.iloc[1:, :]).all()
+    df.loc[:, nan_cols] = df.loc[:, nan_cols].iloc[0].values
 
 
 def detect_encoding(filename):
diff --git a/tests/unit_test_benchmarking.py b/tests/unit_test_benchmarking.py
index 47cab816..182edbd7 100644
--- a/tests/unit_test_benchmarking.py
+++ b/tests/unit_test_benchmarking.py
@@ -44,6 +44,32 @@ def test_different_frames_error(self):
                 load_outputs('data/out_teacup.csv'),
                 load_outputs('data/out_teacup_modified.csv'))
 
+        self.assertIn(
+            "Following columns are not close:\n\tTeacup Temperature",
+            str(err.exception))
+
+        self.assertNotIn(
+            "Column 'Teacup Temperature' is not close.",
+            str(err.exception))
+
+        self.assertNotIn(
+            "Actual values:\n\t",
+            str(err.exception))
+
+        self.assertNotIn(
+            "Expected values:\n\t",
+            str(err.exception))
+
+        with self.assertRaises(AssertionError) as err:
+            assert_frames_close(
+                load_outputs('data/out_teacup.csv'),
+                load_outputs('data/out_teacup_modified.csv'),
+                verbose=True)
+
+        self.assertIn(
+            "Following columns are not close:\n\tTeacup Temperature",
+            str(err.exception))
+
         self.assertIn(
             "Column 'Teacup Temperature' is not close.",
             str(err.exception))
@@ -70,6 +96,36 @@ def test_different_frames_warning(self):
             wu = [w for w in ws if issubclass(w.category, UserWarning)]
             self.assertEqual(len(wu), 1)
 
+            self.assertIn(
+                "Following columns are not close:\n\tTeacup Temperature",
+                str(wu[0].message))
+
+            self.assertNotIn(
+                "Column 'Teacup Temperature' is not close.",
+                str(wu[0].message))
+
+            self.assertNotIn(
+                "Actual values:\n\t",
+                str(wu[0].message))
+
+            self.assertNotIn(
+                "Expected values:\n\t",
+                str(wu[0].message))
+
+        with catch_warnings(record=True) as ws:
+            assert_frames_close(
+                load_outputs('data/out_teacup.csv'),
+                load_outputs('data/out_teacup_modified.csv'),
+                assertion="warn", verbose=True)
+
+            # use only user warnings
+            wu = [w for w in ws if issubclass(w.category, UserWarning)]
+            self.assertEqual(len(wu), 1)
+
+            self.assertIn(
+                "Following columns are not close:\n\tTeacup Temperature",
+                str(wu[0].message))
+
             self.assertIn(
                 "Column 'Teacup Temperature' is not close.",
                 str(wu[0].message))