From 010fe9f84f82ef0ba371092246ff06c127aee42d Mon Sep 17 00:00:00 2001 From: larsevj Date: Wed, 9 Oct 2024 16:54:20 +0200 Subject: [PATCH] Better handling of whitespace in param names --- src/ert/config/design_matrix.py | 76 ++++++++----------- .../test_design_matrix.py | 61 ++++++++++++--- 2 files changed, 84 insertions(+), 53 deletions(-) diff --git a/src/ert/config/design_matrix.py b/src/ert/config/design_matrix.py index f8a3b44e671..8b22e622a6d 100644 --- a/src/ert/config/design_matrix.py +++ b/src/ert/config/design_matrix.py @@ -80,18 +80,22 @@ def read_design_matrix( """ Reads out all file content from different files and create dataframes """ - param_names = pd.read_excel( - self.xls_filename, - sheet_name=self.design_sheet, - nrows=1, - header=None, - dtype=str, - ).iloc[0] + param_names = ( + pd.read_excel( + self.xls_filename, + sheet_name=self.design_sheet, + nrows=1, + header=None, + dtype="string", + ) + .iloc[0] + .apply(lambda x: x.strip() if isinstance(x, str) else x) + ) if len(param_names) - len(set(param_names)) != 0: raise ValueError("Duplicate parameter names found in design sheet") design_matrix_df = DesignMatrix._read_excel( self.xls_filename, self.design_sheet - ) + ).rename(columns=lambda x: str(x).strip()) if "REAL" in design_matrix_df.columns: if not is_integer_dtype(design_matrix_df.dtypes["REAL"]) or any( @@ -148,6 +152,7 @@ def _read_excel( sheet_name: str, usecols: int | list[int] | None = None, header: int | None = 0, + dtype: str | None = None, ) -> pd.DataFrame: """ Make dataframe from excel file @@ -160,6 +165,7 @@ def _read_excel( sheet_name, usecols=usecols, header=header, + dtype=dtype, ) return dframe.dropna(axis=1, how="all") @@ -171,28 +177,17 @@ def _validate_design_matrix(design_matrix: pd.DataFrame) -> list[str]: if design_matrix.empty: return [] errors = [] - try: - unnamed = design_matrix.loc[ - :, design_matrix.columns.str.contains("^Unnamed") - ] - except ValueError as err: - # We catch because int/floats as column headers - # in xlsx gets read as int/float and is not valid to index by. - errors.append(f"Invalid value in design matrix header, error: {err !s}") - else: - column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()] - if len(column_indexes) > 0: - errors.append(f"Column headers not present in column {column_indexes}") - - # Look for initial or trailing whitespace in column headers. This - # is disallowed as it can create user confusion and has no use-case. - for col_header in design_matrix: - if col_header != col_header.strip(): - errors.append( - ( - f"Column header '{col_header}' contains initial or trailing whitespace." - ) - ) + column_indexes_unnamed = [ + index + for index, value in enumerate( + design_matrix.columns.str.contains("^Unnamed") + ) + if value + ] + if len(column_indexes_unnamed) > 0: + errors.append( + f"Column headers not present in column {column_indexes_unnamed}" + ) empties = [ f"Realization {design_matrix.index[i]}, column {design_matrix.columns[j]}" @@ -215,7 +210,10 @@ def _read_defaultssheet( :raises: ValueError if defaults sheet is non-empty but non-parsable """ default_df = DesignMatrix._read_excel( - xlsfilename, defaultssheetname, usecols=[0, 1], header=None + xlsfilename, + defaultssheetname, + header=None, + dtype="string", ) if default_df.empty: return {} @@ -225,19 +223,11 @@ def _read_defaultssheet( f"Row {default_df.index[i]}, column {default_df.columns[j]}" for i, j in zip(*np.where(pd.isna(default_df))) ] - if empty_cells > 0: + if len(empty_cells) > 0: raise ValueError(f"Default sheet contains empty cells {empty_cells}") - # Look for initial or trailing whitespace in parameter names. This - # is disallowed as it can create user confusion and has no use-case. - whitespace_errors = [] - for paramname in default_df.loc[:, 0]: - if paramname != paramname.strip(): - whitespace_errors.append( - f"Parameter name '{paramname}' in default values contains " - "initial or trailing whitespace." - ) - if whitespace_errors > 0: - raise ValueError(whitespace_errors) + default_df[0] = default_df[0].apply(lambda x: x.strip()) + if not default_df[0].is_unique: + raise ValueError("Default sheet contains duplicate parameter names") return {row[0]: convert_to_numeric(row[1]) for _, row in default_df.iterrows()} diff --git a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py index a5c698d25f6..334555cc73a 100644 --- a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py +++ b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py @@ -70,21 +70,16 @@ def test_reading_design_matrix_validate_reals(tmp_path, real_column, error_msg): "Duplicate parameter names found in design sheet", id="duplicate entries", ), + pytest.param( + ["a ", "b", " a"], + "Duplicate parameter names found in design sheet", + id="duplicate entries with whitespaces", + ), pytest.param( ["a", "b ", ""], r"Column headers not present in column \[2\]", id="missing entries", ), - pytest.param( - ["a", "b", 10], - "Invalid value in design matrix header, error: Cannot mask with non-boolean array containing NA / NaN values", - id="float entries", - ), - pytest.param( - ["a", "b", " som "], - r"Column header ' som ' contains initial or trailing whitespace.", - id="float entries", - ), ], ) def test_reading_design_matrix_validate_headers(tmp_path, column_names, error_msg): @@ -137,3 +132,49 @@ def test_reading_design_matrix_validate_cells(tmp_path, values, error_msg): design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues") with pytest.raises(ValueError, match=error_msg): design_matrix.read_design_matrix() + + +@pytest.mark.parametrize( + "data, error_msg", + [ + pytest.param( + [["one"], ["b"], ["d"]], + "Defaults sheet must have at least two columns", + id="Too few columns", + ), + pytest.param( + [["one", 1], ["b", ""], ["d", 6]], + r"Default sheet contains empty cells \['Row 1, column 1'\]", + id="empty cells", + ), + pytest.param( + [[2, 1], ["b", ""], ["d", 6]], + r"Default sheet contains empty cells \['Row 1, column 1'\]", + id="numerical entries as param names", + ), + pytest.param( + [[" a", 1], ["a ", "some"], ["d", 6]], + r"Default sheet contains duplicate parameter names", + id="duplicate parameter names", + ), + ], +) +def test_reading_default_sheet_validation(tmp_path, data, error_msg): + design_path = tmp_path / "design_matrix.xlsx" + design_matrix_df = pd.DataFrame( + { + "REAL": [0, 1, 2], + "a": [1, 2, 3], + "b": [0, 2, 0], + "c": [3, 1, 3], + } + ) + default_sheet_df = pd.DataFrame(data) + with pd.ExcelWriter(design_path) as xl_write: + design_matrix_df.to_excel(xl_write, index=False, sheet_name="DesignSheet01") + default_sheet_df.to_excel( + xl_write, index=False, sheet_name="DefaultValues", header=False + ) + design_matrix = DesignMatrix(design_path, "DesignSheet01", "DefaultValues") + with pytest.raises(ValueError, match=error_msg): + design_matrix.read_design_matrix()