Add column mapping and data reader

GSA-TTS · Feb 22, 2024 · 4e9150f · 4e9150f
1 parent 736cc7b
commit 4e9150f
Show file tree

Hide file tree

Showing 99 changed files with 524 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -175,4 +175,7 @@ control/
 node_modules/
 
 # Local notes
-.notes
+.notes
+
+# Mac Desktop Services Store
+*.DS_Store
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,35 @@
+# Yaml for setting up precommit hooks using:
+# `pre-commit` library <https://pre-commit.com/>
+
+exclude: '^$'
+
+fail_fast: false
+repos:
+    - repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v3.4.0
+      hooks:
+          - id: trailing-whitespace
+            exclude: '[snap|json]$'
+          - id: end-of-file-fixer
+          - id: check-ast
+          - id: debug-statements
+
+    - repo: https://github.com/python/black
+      rev: 20.8b1
+      hooks:
+          - id: black
+            language_version: python
+            additional_dependencies: ['click==8.0.4']
+
+    - repo: https://github.com/pre-commit/mirrors-pylint
+      rev: v2.7.2
+      hooks:
+          - id: pylint
+            language: python
+            args: [
+                    '--disable=R,wrong-import-position,fixme',
+                ]
+    - repo: https://github.com/pre-commit/mirrors-mypy
+      rev: v0.961
+      hooks:
+          - id: mypy
diff --git a/nad_ch/application/data_reader.py b/nad_ch/application/data_reader.py
@@ -0,0 +1,62 @@
+from geopandas import GeoDataFrame, read_file
+import fiona
+from typing import Optional
+import yaml
+import os
+
+
+class DataReader(object):
+    def __init__(self, config_name: Optional[str] = None) -> None:
+        self.config_name = config_name
+        self.default_config_path = "nad_ch/application/nad_column_maps/default.yaml"
+        self.column_map = self.read_column_map()
+
+    def read_column_map(self) -> dict[any]:
+        custom_config_path = (
+            f"nad_ch/application/nad_column_maps/{self.config_name}.yaml"
+        )
+        with open(self.default_config_path, "r") as file:
+            default_config = yaml.safe_load(file)
+        if not os.path.exists(custom_config_path):
+            column_map_config = default_config
+        else:
+            with open(custom_config_path, "r") as file:
+                column_map_config = yaml.safe_load(file)
+                column_map_config["data_required_fields"] = default_config.get(
+                    "data_required_fields"
+                )
+        return column_map_config
+
+    def rename_columns(self, gdf: GeoDataFrame) -> GeoDataFrame:
+        column_map = self.column_map["data_column_mapping"]
+        original_names = {col.lower(): col for col in gdf.columns}
+        valid_renames = {}
+        for nad_column, fields_to_check in column_map.items():
+            orig_matched_name = original_names.get(nad_column.lower())
+            if orig_matched_name:
+                valid_renames[orig_matched_name] = nad_column
+                continue
+            for field in fields_to_check:
+                orig_matched_name = original_names.get(field.lower())
+                if orig_matched_name:
+                    valid_renames[orig_matched_name] = nad_column
+                    break
+        gdf = gdf.rename(columns=valid_renames)
+        return gdf[[col for col in valid_renames.values()]]
+
+    def read_file_in_batches(
+        self, path: str, table_name: Optional[str] = None, batch_size: int = 100000
+    ) -> GeoDataFrame:
+        # TODO: Modify to return a joined table; for cases where 1 or more tables
+        # are needed to get all fields from source file.
+        layers = fiona.listlayers(path)
+        if table_name and table_name not in layers:
+            raise Exception(f"Table name {table_name} does not exist")
+        i = 0
+        while True:
+            gdf = read_file(path, rows=slice(i, i + batch_size))
+            if gdf.shape[0] == 0:
+                break
+            gdf = self.rename_columns(gdf)
+            yield gdf
+            i += batch_size
diff --git a/nad_ch/application/nad_column_maps/default.yaml b/nad_ch/application/nad_column_maps/default.yaml
@@ -0,0 +1,119 @@
+data_required_fields:
+  - Add_Number
+  - AddNo_Full
+  - St_Name
+  - StNam_Full
+  - County
+  - Inc_Muni
+  - Post_City
+  - State
+  - UUID
+  - AddAuth
+  - Longitude
+  - Latitude
+  - NatGrid
+  - Placement
+  - AddrPoint
+  - DateUpdate
+  - NAD_Source
+  - DataSet_ID
+data_column_mapping:
+  AddNum_Pre:
+    - ANUMBERPRE
+  Add_Number:
+    - ANUMBER
+  AddNum_Suf:
+    - ANUMBERSUF
+  AddNo_Full:
+    - ADR_NUM_COMP
+  St_PreMod:
+    - ST_PRE_MOD
+  St_PreDir:
+    - ST_PRE_DIR
+  St_PreTyp:
+    - ST_PRE_TYP
+  St_PreSep:
+    - ST_PRE_SEP
+  St_Name:
+  St_PosTyp:
+    - ST_POS_TYP
+  St_PosDir:
+    - ST_POS_DIR
+  St_PosMod:
+    - ST_POS_MOD
+  StNam_Full:
+    - ST_FULNAM
+  Building:
+  Floor:
+  Unit:
+  Room:
+  Seat:
+  Addtl_Loc:
+  SubAddress:
+  LandmkName:
+    - LANDMARK
+  County:
+    - CO_NAME
+  Inc_Muni:
+  Post_City:
+    - Post_Comm
+    - POSTCOMM
+  Census_Plc:
+  Uninc_Comm:
+  Nbrhd_Comm:
+  NatAmArea:
+  NatAmSub:
+  Urbnztn_PR:
+  PlaceOther:
+  State:
+  Zip_Code:
+    - Post_Code
+    - ZIP
+  Plus_4:
+    - Post_Code4
+    - ZIP4
+  UUID:
+    - GlobalID
+  AddAuth:
+    - DiscrpAgID
+    - AAUTHORITY
+  AddrRefSys:
+  Longitude:
+    - Long
+    - LONGITUDE
+  Latitude:
+    - Lat
+    - LATITUDE
+  NatGrid:
+    - USNG_CODE
+  Elevation:
+    - Elev
+  Placement:
+    - PLACE_LOC
+  AddrPoint:
+  Related_ID:
+  RelateType:
+  ParcelSrc:
+  Parcel_ID:
+    - STATE_PIN
+  AddrClass:
+  Lifecycle:
+    - STATUS
+  Effective:
+    - EFF_DATE
+  Expire:
+    - RET_DATE
+  DateUpdate:
+    - EDIT_DATE
+  AnomStatus:
+    - VERROR_911
+  LocatnDesc:
+    - LOC_DESC
+  Addr_Type:
+    - Place_Type
+  PlaceNmTyp:
+  DeliverTyp:
+  NAD_Source:
+  DataSet_ID:
+    - Site_NGUID
+    - ADD_ID
diff --git a/nad_ch/application/nad_column_maps/testprovider1.yaml b/nad_ch/application/nad_column_maps/testprovider1.yaml
@@ -0,0 +1,47 @@
+data_column_mapping:
+  COL_0:
+    - ID
+  COL_1:
+    - STCOFIPS
+  COL_10:
+    - HISPPOP
+  COL_11:
+    - AMERIND
+  COL_12:
+    - ASIAN
+  COL_13:
+    - PACIFIC
+  COL_14:
+    - RACE2UP
+  COL_15:
+    - OTHRACE
+  COL_16:
+    - LASTUPDATE
+  COL_17:
+    - LASTEDITOR
+  COL_18:
+    - AGEMAJOR
+  COL_19:
+    - AREASQMETER
+  COL_2:
+    - TRACT
+  COL_20:
+    - Shape_Length
+  COL_21:
+    - Shape_Area
+  COL_22:
+    - geometry
+  COL_3:
+    - STFID
+  COL_4:
+    - BLOCK
+  COL_5:
+    - TOTPOP
+  COL_6:
+    - POPDENS
+  COL_7:
+    - RACEBASE
+  COL_8:
+    - WHITE
+  COL_9:
+    - BLACK
diff --git a/nad_ch/application/nad_column_maps/testprovider2.yaml b/nad_ch/application/nad_column_maps/testprovider2.yaml
@@ -0,0 +1,11 @@
+data_column_mapping:
+  COL_0:
+    - NAME
+  COL_1:
+    - ST
+  COL_2:
+    - ZIP
+  COL_3:
+    - RuleID
+  COL_4:
+    - geometry
diff --git a/nad_ch/application/use_cases.py b/nad_ch/application/use_cases.py
@@ -107,6 +107,9 @@ def validate_data_submission(ctx: ApplicationContext, filename: str):
         ctx.logger.error("Data extration error")
         return
 
+    # data_producer = submission.producer
+    # config_name = f"{data_producer.name}_{data_producer.id}"
+    # TODO: Incorporate config
     report = ctx.task_queue.run_load_and_validate(
         ctx.submissions, submission.id, download_result.extracted_dir
     )

diff --git a/nad_ch/infrastructure/task_queue.py b/nad_ch/infrastructure/task_queue.py
@@ -6,6 +6,8 @@
     report_to_dict,
     report_from_dict,
 )
+from typing import Optional
+from nad_ch.application.data_reader import DataReader
 from nad_ch.application.interfaces import TaskQueue
 from nad_ch.application.validation import get_feature_count, get_feature_details
 from nad_ch.config import QUEUE_BROKER_URL, QUEUE_BACKEND_URL

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ pandas = "^2.2.0"
 geopandas = "^0.14.2"
 pyarrow = "^15.0.0"
 shapely = "^2.0.2"
+pyyaml = "^6.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.2"