diff --git a/nad_ch/application/data_reader.py b/nad_ch/application/data_reader.py
index 434ff7e..a4de291 100644
--- a/nad_ch/application/data_reader.py
+++ b/nad_ch/application/data_reader.py
@@ -27,6 +27,30 @@ def read_column_map(self) -> dict[any]:
         )
         return column_map_config
 
+    def validate_column_map(self) -> None:
+        """Reject column maps that assign one input column to several fields.
+
+        Input names are compared case-insensitively because rename_columns
+        looks up the data's columns by their lower-cased names.
+
+        Raises:
+            ValueError: If any input name appears more than once in the map.
+        """
+        column_map = self.column_map["data_column_mapping"]
+        fields_by_input = {}
+
+        for nad_field, input_names in column_map.items():
+            # An entry left empty in the YAML file loads as None, not [].
+            for input_name in input_names or []:
+                fields_by_input.setdefault(input_name.lower(), []).append(nad_field)
+
+        duplicates = [group for group in fields_by_input.values() if len(group) > 1]
+        if duplicates:
+            duplicate_nad_fields = ", ".join(" & ".join(group) for group in duplicates)
+            raise ValueError(
+                f"Duplicate inputs found for destination fields: {duplicate_nad_fields}"
+            )
+
     def rename_columns(self, gdf: GeoDataFrame) -> GeoDataFrame:
         column_map = self.column_map["data_column_mapping"]
         original_names = {col.lower(): col for col in gdf.columns}
diff --git a/nad_ch/application/nad_column_maps/testprovider1.yaml b/nad_ch/application/nad_column_maps/testprovider1.yaml
index 830ce1a..51ae22d 100644
--- a/nad_ch/application/nad_column_maps/testprovider1.yaml
+++ b/nad_ch/application/nad_column_maps/testprovider1.yaml
@@ -25,6 +25,7 @@ data_column_mapping:
     - AREASQMETER
   COL_2:
     - TRACT
+    - Pacific
   COL_20:
     - Shape_Length
   COL_21:
@@ -39,6 +40,7 @@ data_column_mapping:
     - TOTPOP
   COL_6:
     - POPDENS
+    - totPop
   COL_7:
     - RACEBASE
   COL_8:
diff --git a/tests/application/test_data_reader.py b/tests/application/test_data_reader.py
index c27c49d..92b46b8 100644
--- a/tests/application/test_data_reader.py
+++ b/tests/application/test_data_reader.py
@@ -6,6 +6,7 @@
 )
 import pickle
 from pandas.testing import assert_frame_equal
+import pytest
 
 TEST_DATA_DIR = "tests/test_data"
 
@@ -36,6 +37,21 @@ def test_read_column_map():
     )
 
 
+def test_validate_column_map():
+    reader = DataReader("testprovider1")
+    with pytest.raises(ValueError) as exc:
+        reader.validate_column_map()
+    # Checked after the with block: statements placed after the raising call
+    # inside it would never execute.
+    assert (
+        str(exc.value)
+        == "Duplicate inputs found for destination fields: COL_13 & COL_2, COL_5 & COL_6"
+    )
+
+    reader = DataReader("testprovider2")
+    reader.validate_column_map()
+
+
 def test_read_file_in_batches_shape():
     file_path = os.path.join(
         TEST_DATA_DIR, "shapefiles/usa-major-cities/usa-major-cities.shp"
diff --git a/tests/test_data/config_baselines.py b/tests/test_data/config_baselines.py
index db003ec..e0d5b4e 100644
--- a/tests/test_data/config_baselines.py
+++ b/tests/test_data/config_baselines.py
@@ -96,14 +96,14 @@
         "COL_17": ["LASTEDITOR"],
         "COL_18": ["AGEMAJOR"],
         "COL_19": ["AREASQMETER"],
-        "COL_2": ["TRACT"],
+        "COL_2": ["TRACT", "Pacific"],
         "COL_20": ["Shape_Length"],
         "COL_21": ["Shape_Area"],
         "COL_22": ["geometry"],
         "COL_3": ["STFID"],
         "COL_4": ["BLOCK"],
         "COL_5": ["TOTPOP"],
-        "COL_6": ["POPDENS"],
+        "COL_6": ["POPDENS", "totPop"],
         "COL_7": ["RACEBASE"],
         "COL_8": ["WHITE"],
         "COL_9": ["BLACK"],