Commit: Add column mapping and data reader

committed Feb 22, 2024
1 parent 736cc7b commit 4e9150f
Showing 99 changed files with 524 additions and 2 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -175,4 +175,7 @@ control/
node_modules/

# Local notes
.notes
.notes

# Mac Desktop Services Store
*.DS_Store
35 changes: 35 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,35 @@
# YAML for setting up pre-commit hooks using the
# `pre-commit` library <https://pre-commit.com/>

exclude: '^$'

fail_fast: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.4.0
    hooks:
      - id: trailing-whitespace
        exclude: '[snap|json]$'
      - id: end-of-file-fixer
      - id: check-ast
      - id: debug-statements

  - repo: https://github.com/python/black
    rev: 20.8b1
    hooks:
      - id: black
        language_version: python
        additional_dependencies: ['click==8.0.4']

  - repo: https://github.com/pre-commit/mirrors-pylint
    rev: v2.7.2
    hooks:
      - id: pylint
        language: python
        args: [
          '--disable=R,wrong-import-position,fixme',
        ]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.961
    hooks:
      - id: mypy
62 changes: 62 additions & 0 deletions nad_ch/application/data_reader.py
@@ -0,0 +1,62 @@
from geopandas import GeoDataFrame, read_file
import fiona
from typing import Any, Iterator, Optional
import yaml
import os


class DataReader(object):
    def __init__(self, config_name: Optional[str] = None) -> None:
        self.config_name = config_name
        self.default_config_path = "nad_ch/application/nad_column_maps/default.yaml"
        self.column_map = self.read_column_map()

    def read_column_map(self) -> dict[str, Any]:
        custom_config_path = (
            f"nad_ch/application/nad_column_maps/{self.config_name}.yaml"
        )
        with open(self.default_config_path, "r") as file:
            default_config = yaml.safe_load(file)
        if not os.path.exists(custom_config_path):
            column_map_config = default_config
        else:
            with open(custom_config_path, "r") as file:
                column_map_config = yaml.safe_load(file)
            column_map_config["data_required_fields"] = default_config.get(
                "data_required_fields"
            )
        return column_map_config

    def rename_columns(self, gdf: GeoDataFrame) -> GeoDataFrame:
        column_map = self.column_map["data_column_mapping"]
        original_names = {col.lower(): col for col in gdf.columns}
        valid_renames = {}
        for nad_column, fields_to_check in column_map.items():
            orig_matched_name = original_names.get(nad_column.lower())
            if orig_matched_name:
                valid_renames[orig_matched_name] = nad_column
                continue
            # NAD columns with no aliases load from YAML as None; skip them.
            for field in fields_to_check or []:
                orig_matched_name = original_names.get(field.lower())
                if orig_matched_name:
                    valid_renames[orig_matched_name] = nad_column
                    break
        gdf = gdf.rename(columns=valid_renames)
        return gdf[[col for col in valid_renames.values()]]

    def read_file_in_batches(
        self, path: str, table_name: Optional[str] = None, batch_size: int = 100000
    ) -> Iterator[GeoDataFrame]:
        # TODO: Modify to return a joined table for cases where more than one
        # table is needed to get all fields from the source file.
        layers = fiona.listlayers(path)
        if table_name and table_name not in layers:
            raise Exception(f"Table name {table_name} does not exist")
        i = 0
        while True:
            gdf = read_file(path, rows=slice(i, i + batch_size))
            if gdf.shape[0] == 0:
                break
            gdf = self.rename_columns(gdf)
            yield gdf
            i += batch_size
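
A minimal usage sketch of the reader (not part of this commit). The geodatabase path is a placeholder, "testprovider1" refers to one of the provider configs added below, and the script assumes it is run from the repository root so the relative YAML paths resolve:

from nad_ch.application.data_reader import DataReader

# An unrecognized config name simply falls back to default.yaml.
reader = DataReader(config_name="testprovider1")

total_rows = 0
# "data/submission.gdb" is a hypothetical File Geodatabase path.
for batch in reader.read_file_in_batches("data/submission.gdb", batch_size=50000):
    # Each batch is a GeoDataFrame whose columns have already been renamed
    # by rename_columns().
    total_rows += batch.shape[0]
print(f"Read {total_rows} rows")
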
119 changes: 119 additions & 0 deletions nad_ch/application/nad_column_maps/default.yaml
@@ -0,0 +1,119 @@
data_required_fields:
  - Add_Number
  - AddNo_Full
  - St_Name
  - StNam_Full
  - County
  - Inc_Muni
  - Post_City
  - State
  - UUID
  - AddAuth
  - Longitude
  - Latitude
  - NatGrid
  - Placement
  - AddrPoint
  - DateUpdate
  - NAD_Source
  - DataSet_ID
data_column_mapping:
  AddNum_Pre:
    - ANUMBERPRE
  Add_Number:
    - ANUMBER
  AddNum_Suf:
    - ANUMBERSUF
  AddNo_Full:
    - ADR_NUM_COMP
  St_PreMod:
    - ST_PRE_MOD
  St_PreDir:
    - ST_PRE_DIR
  St_PreTyp:
    - ST_PRE_TYP
  St_PreSep:
    - ST_PRE_SEP
  St_Name:
  St_PosTyp:
    - ST_POS_TYP
  St_PosDir:
    - ST_POS_DIR
  St_PosMod:
    - ST_POS_MOD
  StNam_Full:
    - ST_FULNAM
  Building:
  Floor:
  Unit:
  Room:
  Seat:
  Addtl_Loc:
  SubAddress:
  LandmkName:
    - LANDMARK
  County:
    - CO_NAME
  Inc_Muni:
  Post_City:
    - Post_Comm
    - POSTCOMM
  Census_Plc:
  Uninc_Comm:
  Nbrhd_Comm:
  NatAmArea:
  NatAmSub:
  Urbnztn_PR:
  PlaceOther:
  State:
  Zip_Code:
    - Post_Code
    - ZIP
  Plus_4:
    - Post_Code4
    - ZIP4
  UUID:
    - GlobalID
  AddAuth:
    - DiscrpAgID
    - AAUTHORITY
  AddrRefSys:
  Longitude:
    - Long
    - LONGITUDE
  Latitude:
    - Lat
    - LATITUDE
  NatGrid:
    - USNG_CODE
  Elevation:
    - Elev
  Placement:
    - PLACE_LOC
  AddrPoint:
  Related_ID:
  RelateType:
  ParcelSrc:
  Parcel_ID:
    - STATE_PIN
  AddrClass:
  Lifecycle:
    - STATUS
  Effective:
    - EFF_DATE
  Expire:
    - RET_DATE
  DateUpdate:
    - EDIT_DATE
  AnomStatus:
    - VERROR_911
  LocatnDesc:
    - LOC_DESC
  Addr_Type:
    - Place_Type
  PlaceNmTyp:
  DeliverTyp:
  NAD_Source:
  DataSet_ID:
    - Site_NGUID
    - ADD_ID
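
The map above doubles as the fallback config and as the single source of data_required_fields for provider-specific configs. A small sketch of that merge behavior (not part of the commit; "someprovider" is a made-up name, and the script assumes the repository root as the working directory):

from nad_ch.application.data_reader import DataReader

# No someprovider.yaml exists, so the default map above is used unchanged.
fallback_reader = DataReader(config_name="someprovider")
print(fallback_reader.column_map["data_required_fields"][:3])
# ['Add_Number', 'AddNo_Full', 'St_Name']

# testprovider1.yaml (below) supplies its own data_column_mapping, but
# data_required_fields is still copied over from default.yaml.
provider_reader = DataReader(config_name="testprovider1")
print("data_required_fields" in provider_reader.column_map)  # True
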
47 changes: 47 additions & 0 deletions nad_ch/application/nad_column_maps/testprovider1.yaml
@@ -0,0 +1,47 @@
data_column_mapping:
  COL_0:
    - ID
  COL_1:
    - STCOFIPS
  COL_10:
    - HISPPOP
  COL_11:
    - AMERIND
  COL_12:
    - ASIAN
  COL_13:
    - PACIFIC
  COL_14:
    - RACE2UP
  COL_15:
    - OTHRACE
  COL_16:
    - LASTUPDATE
  COL_17:
    - LASTEDITOR
  COL_18:
    - AGEMAJOR
  COL_19:
    - AREASQMETER
  COL_2:
    - TRACT
  COL_20:
    - Shape_Length
  COL_21:
    - Shape_Area
  COL_22:
    - geometry
  COL_3:
    - STFID
  COL_4:
    - BLOCK
  COL_5:
    - TOTPOP
  COL_6:
    - POPDENS
  COL_7:
    - RACEBASE
  COL_8:
    - WHITE
  COL_9:
    - BLACK
11 changes: 11 additions & 0 deletions nad_ch/application/nad_column_maps/testprovider2.yaml
@@ -0,0 +1,11 @@
data_column_mapping:
  COL_0:
    - NAME
  COL_1:
    - ST
  COL_2:
    - ZIP
  COL_3:
    - RuleID
  COL_4:
    - geometry
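
To show how these test fixtures are consumed by rename_columns, a short sketch (not part of the commit; the sample record is invented, and the script assumes the repository root as the working directory):

from geopandas import GeoDataFrame
from shapely.geometry import Point

from nad_ch.application.data_reader import DataReader

gdf = GeoDataFrame(
    {"NAME": ["Main Office"], "ST": ["VA"], "ZIP": ["22201"], "RuleID": [1]},
    geometry=[Point(-77.09, 38.88)],
)

reader = DataReader(config_name="testprovider2")
renamed = reader.rename_columns(gdf)
print(list(renamed.columns))
# ['COL_0', 'COL_1', 'COL_2', 'COL_3', 'COL_4']
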
3 changes: 3 additions & 0 deletions nad_ch/application/use_cases.py
@@ -107,6 +107,9 @@ def validate_data_submission(ctx: ApplicationContext, filename: str):
        ctx.logger.error("Data extraction error")
        return

    # data_producer = submission.producer
    # config_name = f"{data_producer.name}_{data_producer.id}"
    # TODO: Incorporate config
    report = ctx.task_queue.run_load_and_validate(
        ctx.submissions, submission.id, download_result.extracted_dir
    )
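
A hedged sketch of how the commented-out config lookup above might eventually be wired up; the helper name and the producer attributes are assumptions for illustration, not part of this commit:

from nad_ch.application.data_reader import DataReader


def build_reader_for(producer_name: str, producer_id: int) -> DataReader:
    # Mirrors the commented-out idea: derive a per-producer config name and let
    # DataReader fall back to default.yaml when no matching file exists.
    config_name = f"{producer_name}_{producer_id}"
    return DataReader(config_name=config_name)


reader = build_reader_for("testprovider1", 1)  # hypothetical producer
print(reader.config_name)  # "testprovider1_1" -> no such YAML, so the default map is used
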
2 changes: 2 additions & 0 deletions nad_ch/infrastructure/task_queue.py
@@ -6,6 +6,8 @@
    report_to_dict,
    report_from_dict,
)
from typing import Optional
from nad_ch.application.data_reader import DataReader
from nad_ch.application.interfaces import TaskQueue
from nad_ch.application.validation import get_feature_count, get_feature_details
from nad_ch.config import QUEUE_BROKER_URL, QUEUE_BACKEND_URL
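
The new imports suggest the task-queue worker will construct a DataReader with an optional config name. A speculative sketch of that shape (the function name and signature below are assumptions, not the actual task in this file):

from typing import Optional

from nad_ch.application.data_reader import DataReader


def load_and_validate_sketch(gdb_file_path: str, config_name: Optional[str] = None) -> dict:
    reader = DataReader(config_name)
    first_batch_columns: list[str] = []
    for gdf in reader.read_file_in_batches(gdb_file_path):
        first_batch_columns = list(gdf.columns)
        break  # sketch only: inspect the first renamed batch
    return {"columns": first_batch_columns}
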
62 changes: 61 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -26,6 +26,7 @@ pandas = "^2.2.0"
geopandas = "^0.14.2"
pyarrow = "^15.0.0"
shapely = "^2.0.2"
pyyaml = "^6.0.1"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.2"